# Bootstrapping 

In [3]:
#%reset
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix
from num2words import num2words
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold, RepeatedStratifiedKFold
from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score
import word2number
from word2number import w2n
from sklearn.tree import DecisionTreeClassifier
import pickle

hfont = {'fontname':'Helvetica'}

Set wd to be a folder not on github

In [4]:
new_directory = '/Users/rem76/Documents/COVID_projections/Bootstrapping/'
os.chdir(new_directory)

In [5]:
def create_column_names(categories_for_subsetting, num_of_weeks):
    column_names = ['HSA_ID']

    for week in range(1, num_of_weeks + 1):
        week = num2words(week)
        for category in categories_for_subsetting:
            column_name = f'week_{week}_{category}'
            column_names.append(column_name)

    return column_names

def create_collated_weekly_data(pivoted_table, original_data, categories_for_subsetting, geography, column_names):
    collated_data = pd.DataFrame(index=range(51), columns=column_names)

    x = 0
    for geo in original_data[geography].unique():
        #matching_indices = [i for i, geo_col in enumerate(pivoted_table) if geo_col == geo]
        collated_data.loc[x, geography] = geo
        columns_to_subset = [f'{geo}_{category}' for category in categories_for_subsetting]
        j = 1
        try:
            for row in range(len(pivoted_table.loc[:, columns_to_subset])):
                collated_data.iloc[x, j:j + len(categories_for_subsetting)] = pivoted_table.loc[row, columns_to_subset]
                j += len(categories_for_subsetting)
        except:
            pass
        x += 1

    return collated_data


In [6]:
def calculate_metrics(confusion_matrix):
    # Extract values from the confusion matrix
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    TN = confusion_matrix[0, 0]
    FN = confusion_matrix[1, 0]

    # Calculate Sensitivity (True Positive Rate) and Specificity (True Negative Rate)
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0

    # Calculate PPV (Precision) and NPV
    ppv = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    npv = TN / (TN + FN) if (TN + FN) > 0 else 0.0

    return sensitivity, specificity, ppv, npv

In [7]:

def prep_training_test_data_period(data, no_weeks, weeks_in_future, if_train, geography, weight_col, keep_output):
## Get the weeks for the x and y datasets   
    x_weeks = []  
    y_weeks = []
    y_weeks_to_check = [] #check these weeks to see if any of them are equal to 1
    for week in no_weeks:
        test_week = int(week) + weeks_in_future
        x_weeks.append('_' + num2words(week) + '_')
        for week_y in range(week+1, test_week+1):
                y_weeks_to_check.append('_' + num2words(week_y) + '_')
        y_weeks.append('_' + num2words(test_week) + '_')
    
## Divide up the test/train split
    #if is_geographic:
        # Calculate the index to start slicing from
    #    start_index = len(data['county']) // proportion[0] * proportion[1]
        # Divide up the dataset based on this proportion
    #    first_two_thirds = data['county'][:start_index]
    #    last_third = data['county'][start_index:]
    X_data = pd.DataFrame()
    y_data = pd.DataFrame()
    weights_all =  pd.DataFrame()
    missing_data = []
    ## Now get the training data 
    k = 0
    for x_week in x_weeks:
            y_week = y_weeks[k]
            k +=1

            weeks_x = [col for col in data.columns if x_week in col]
            columns_x  = [geography] + weeks_x + [weight_col]
            data_x = data[columns_x]

            weeks_y = [col for col in data.columns if y_week in col]
            columns_y  = [geography] + weeks_y
            data_y = data[columns_y]
            ### now add the final column to the y data that has it so that it's if any week in the trhee week perdiod exceeded 15
            train_week = w2n.word_to_num(x_week.replace("_", ""))
            target_week =  w2n.word_to_num(y_week.replace("_", ""))
            y_weeks_to_check = []
            for week_to_check in range(train_week + 1, target_week + 1):
                y_weeks_to_check.append('_' + num2words(week_to_check) + '_')

            y_weeks_to_check = [week + 'beds_over_15_100k' for week in y_weeks_to_check]
            columns_to_check = [col for col in data.columns if any(week in col for week in y_weeks_to_check)]
            y_over_in_period = data[columns_to_check].apply(max, axis=1)
            data_y = pd.concat([data_y, y_over_in_period], axis=1)
            # ensure they have the same amount of data
            #remove rows in test_data1 with NA in test_data2
            data_x = data_x.dropna()
            data_x = data_x[data_x[geography].isin(data_y[geography])]
            # remove rows in test_data2 with NA in test_data1
            data_y = data_y.dropna()
            data_y = data_y[data_y[geography].isin(data_x[geography])]
            data_x = data_x[data_x[geography].isin(data_y[geography])]
            data_x_no_HSA = len(data_x[geography].unique())

            missing_data.append(((len(data[geography].unique()) - data_x_no_HSA)/len(data[geography].unique())) * 100)
            # get weights 
            #weights = weight_data[weight_data[geography].isin(data_x[geography])][[geography, weight_col]]

            X_week = data_x.iloc[:, 1:len(columns_x)]  # take away y, leave weights for mo
            y_week = data_y.iloc[:, -1] 
            
            y_week = y_week.astype(int)
            if if_train:

                 X_week, y_week = oversample.fit_resample(X_week, y_week)
            weights = X_week.iloc[:, -1] 
            if keep_output:
                X_week = X_week.iloc[:, :len(X_week.columns)-1] # remove the weights and leave "target" for that week

                #rename columns for concatenation 
                X_week.columns = range(1, len(data_x.columns) -1)
            else:
                X_week = X_week.iloc[:, :len(X_week.columns)-2] # remove the weights and  "target" for that week

                X_week.columns = range(1, len(data_x.columns) -2)# remove the weights and  "target" for that week

            y_week.columns = range(1, len(data_y.columns) -2)
            X_data = pd.concat([X_data, X_week])
            y_data = pd.concat([y_data, y_week]) 
        
            weights_all =  pd.concat([weights_all, weights]) 


    X_data.reset_index(drop=True, inplace=True)
    y_data.reset_index(drop=True, inplace=True)
    weights_all.reset_index(drop=True, inplace=True)

    return(X_data, y_data, weights_all, missing_data)


### this code it's ANY in the x week period 
def prep_training_test_data(data, no_weeks, weeks_in_future, if_train, geography, weight_col, keep_output):
## Get the weeks for the x and y datasets   
    x_weeks = []  
    y_weeks = []
    for week in no_weeks:
        test_week = int(week) + weeks_in_future
        x_weeks.append('_' + num2words(week) + '_')
        y_weeks.append('_' + num2words(test_week) + '_')
    
    X_data = pd.DataFrame()
    y_data = pd.DataFrame()
    weights_all =  pd.DataFrame()
    missing_data = []
    ## Now get the training data \
    k = 0
    for x_week in x_weeks:
            y_week = y_weeks[k]
            k += 1
            weeks_x = [col for col in data.columns if x_week in col]
            columns_x  = [geography] + weeks_x + [weight_col]
            data_x = data[columns_x]

            weeks_y = [col for col in data.columns if y_week in col]
            columns_y  = [geography] + weeks_y
            data_y = data[columns_y]
            # ensure they have the same amount of data
            #remove rows in test_data1 with NA in test_data2
            data_x = data_x.dropna()
            data_x = data_x[data_x[geography].isin(data_y[geography])]
            # remove rows in test_data2 with NA in test_data1
            data_y = data_y.dropna()
            data_y = data_y[data_y[geography].isin(data_x[geography])]
            data_x = data_x[data_x[geography].isin(data_y[geography])]
            data_x_no_HSA = len(data_x[geography].unique())

            missing_data.append(((len(data[geography].unique()) - data_x_no_HSA)/len(data[geography].unique())) * 100)
            # get weights 
            #weights = weight_data[weight_data[geography].isin(data_x[geography])][[geography, weight_col]]

            X_week = data_x.iloc[:, 1:len(columns_x)]  # take away y, leave weights for mo
            y_week = data_y.iloc[:, -1] 
            
            y_week = y_week.astype(int)
            if if_train:

                 X_week, y_week = oversample.fit_resample(X_week, y_week)
            weights = X_week.iloc[:, -1] 
            if keep_output:
                X_week = X_week.iloc[:, :len(X_week.columns)-1] # remove the weights and leave "target" for that week

                #rename columns for concatenation 
                X_week.columns = range(1, len(data_x.columns) -1)
            else:
                X_week = X_week.iloc[:, :len(X_week.columns)-2] # remove the weights and  "target" for that week

                X_week.columns = range(1, len(data_x.columns) -2)# remove the weights and  "target" for that week

                #rename columns for concatenation 
            y_week.columns = range(1, len(data_y.columns) -1)
            X_data = pd.concat([X_data, X_week])
            y_data = pd.concat([y_data, y_week]) 
        
            weights_all =  pd.concat([weights_all, weights]) 


    X_data.reset_index(drop=True, inplace=True)
    y_data.reset_index(drop=True, inplace=True)
    weights_all.reset_index(drop=True, inplace=True)

    return(X_data, y_data, weights_all, missing_data)

In [47]:
def calculate_percentiles(iterations, model_name, ROC_actual, accuracy_actual, sensitivity_actual, specificity_actual, ppv_actual, npv_actual, X_test ,y_test):
        bootstrapped_stats_ROC = []
        bootstrapped_stats_accuracy = []
        bootstrapped_stats_sesitivity = []
        bootstrapped_stats_specificity = []
        bootstrapped_stats_ppv = []
        bootstrapped_stats_npv = []

        for j in iterations:
            model_name_to_load = model_name + "_" + str(j)+ ".sav" 
            model_fit = pickle.load(open(model_name_to_load, 'rb'))
            y_bootstrap_predict = model_fit.predict(X_test)
            y_bootstrap_predict_proba = model_fit.predict_proba(X_test)

            ROC_AUC_bootstrap_test_performance = metrics.roc_auc_score(y_test, y_bootstrap_predict_proba[:,1]) 
            accuracy_bootstrap_test_performance  = accuracy_score(y_test, y_bootstrap_predict)

            sensitivity_bootstrap_test_performance, specificity_bootstrap_test_performance, ppv_bootstrap_test_performance, npv_bootstrap_test_performance = calculate_metrics(confusion_matrix(y_test, y_bootstrap_predict))
        ### (D) Calculate estimate fo variance  by getting (B) - (D) 

            bootstrapped_stats_ROC.append({'Difference': ROC_AUC_bootstrap_test_performance - ROC_actual}) ## according to https://ocw.mit.edu/courses/18-05-introduction-to-probability-and-statistics-spring-2014/resources/mit18_05s14_reading24/
            bootstrapped_stats_accuracy.append({'Difference': accuracy_bootstrap_test_performance - accuracy_actual}) ## according to https://ocw.mit.edu/courses/18-05-introduction-to-probability-and-statistics-spring-2014/resources/mit18_05s14_reading24/
            bootstrapped_stats_sesitivity.append({'Difference': sensitivity_bootstrap_test_performance - sensitivity_actual}) ## according to https://ocw.mit.edu/courses/18-05-introduction-to-probability-and-statistics-spring-2014/resources/mit18_05s14_reading24/
            bootstrapped_stats_specificity.append({'Difference': specificity_bootstrap_test_performance - specificity_actual}) ## according to https://ocw.mit.edu/courses/18-05-introduction-to-probability-and-statistics-spring-2014/resources/mit18_05s14_reading24/
            bootstrapped_stats_ppv.append({'Difference': ppv_bootstrap_test_performance - ppv_actual}) ## according to https://ocw.mit.edu/courses/18-05-introduction-to-probability-and-statistics-spring-2014/resources/mit18_05s14_reading24/
            bootstrapped_stats_npv.append({'Difference': npv_bootstrap_test_performance - npv_actual}) ## according to https://ocw.mit.edu/courses/18-05-introduction-to-probability-and-statistics-spring-2014/resources/mit18_05s14_reading24/


        bootstrapped_stats_ROC = pd.DataFrame(bootstrapped_stats_ROC)
        bootstrapped_stats_accuracy = pd.DataFrame(bootstrapped_stats_accuracy)
        bootstrapped_stats_sesitivity = pd.DataFrame(bootstrapped_stats_sesitivity)
        bootstrapped_stats_specificity = pd.DataFrame(bootstrapped_stats_specificity)
        bootstrapped_stats_ppv = pd.DataFrame(bootstrapped_stats_ppv)
        bootstrapped_stats_npv = pd.DataFrame(bootstrapped_stats_npv)

    ## Step 3: Get percentile
        alpha = 0.05

        upper_quartile_ROC, lower_quartile_ROC = ROC_actual - np.percentile(bootstrapped_stats_ROC["Difference"], [100 * (1 - alpha / 2.0), 100 * alpha / 2.0])
        upper_quartile_accuracy, lower_quartile_accuracy = accuracy_actual - np.percentile(bootstrapped_stats_accuracy["Difference"], [100 * (1 - alpha / 2.0), 100 * alpha / 2.0])
        upper_quartile_sensitivity, lower_quartile_sensitivity = sensitivity_actual - np.percentile(bootstrapped_stats_sesitivity["Difference"], [100 * (1 - alpha / 2.0), 100 * alpha / 2.0])
        upper_quartile_specificity, lower_quartile_specificity = specificity_actual - np.percentile(bootstrapped_stats_specificity["Difference"], [100 * (1 - alpha / 2.0), 100 * alpha / 2.0])
        upper_quartile_ppv, lower_quartile_ppv = ppv_actual - np.percentile(bootstrapped_stats_ppv["Difference"], [100 * (1 - alpha / 2.0), 100 * alpha / 2.0])
        upper_quartile_npv, lower_quartile_npv = npv_actual - np.percentile(bootstrapped_stats_npv["Difference"], [100 * (1 - alpha / 2.0), 100 * alpha / 2.0])
        ## Step 4: Get optimization-corrected performance

        return upper_quartile_ROC, lower_quartile_ROC, upper_quartile_accuracy, lower_quartile_accuracy, upper_quartile_sensitivity, lower_quartile_sensitivity, upper_quartile_specificity, lower_quartile_specificity, upper_quartile_ppv, lower_quartile_ppv, upper_quartile_npv, lower_quartile_npv

### now try bootstrapping w/o feature selection
iterations = 100
## DO NOT SAMPLE THE TARGET DATA
def bootstrap_no_dev(iterations, clf, param_grid, cv, iterations_param_search, data,model_name, time_period, no_weeks, keep_output, weeks_in_future, if_train, geography, weight_col):
      #1. Get dataset
    for j in iterations:
        #2. (A) Sample all individuals from training data w/replacement
          if time_period == 'period':
                X_sample_train, y_sample_train, weights_train, missing_data_train_HSA = prep_training_test_data_period(data, no_weeks = no_weeks, weeks_in_future = weeks_in_future, if_train = if_train, geography = geography, weight_col = weight_col,keep_output = keep_output)
          elif time_period == 'exact':
                X_sample_train, y_sample_train, weights_train, missing_data_train_HSA = prep_training_test_data(data, no_weeks = no_weeks, weeks_in_future = weeks_in_future, if_train = if_train, geography = geography, weight_col =weight_col,keep_output = keep_output)
          elif time_period == 'shifted':
                X_sample_train, y_sample_train, weights_train, missing_data_train_HSA = prep_training_test_data_period(data, no_weeks = no_weeks, weeks_in_future = weeks_in_future, if_train = if_train, geography = geography, weight_col = weight_col,keep_output = keep_output)
                y_sample_train = y_sample_train.shift(-1)
                y_sample_train.drop(index=y_sample_train.index[-1], inplace=True)
                X_sample_train.drop(index=X_sample_train.index[-1], inplace=True)# want these data to stay the same
                weights_train.drop(index=weights_train.index[-1], inplace=True)

        # need to rejoin, resample, then seperate 
          training_data = pd.merge(X_sample_train, y_sample_train, left_index=True, right_index=True)
          training_data = pd.merge(training_data, weights_train, left_index=True, right_index=True)

          training_data_resampled = training_data.sample(frac = 1, replace=True)
          weights_train = training_data_resampled.iloc[:,-1:] ##(a) sample n individuals with replacement
          weights_train = weights_train.iloc[:,0]
          training_data_resampled.drop(training_data_resampled.columns[-1],axis=1, inplace = True)
          y_sample_train = training_data_resampled.iloc[:,-1:]
          training_data_resampled.drop(training_data_resampled.columns[-1],axis=1, inplace = True)
          X_sample_train = training_data_resampled
        #  (B) Predictive model 
          random_search = RandomizedSearchCV(clf, param_grid, n_iter=iterations_param_search, cv=cv, random_state=10)
          random_search.fit(X_sample_train, y_sample_train, sample_weight = weights_train)

          best_params = random_search.best_params_

# Create the Decision Tree classifier with the best hyperparameters
          model = DecisionTreeClassifier(**best_params,random_state=10, class_weight='balanced')
          model_fit = model.fit(X_sample_train, y_sample_train, sample_weight=weights_train)

          model_name_to_save = model_name + "_" + str(j)+ ".sav" 
          X_data_name = model_name + "_X_data_" + str(j) + ".csv" 
          y_data_name = model_name + "_y_data_" + str(j) + ".csv" 
          weights_data_name = model_name + "_weights_" + str(j) + ".csv" 
          
          weights_train.to_csv(weights_data_name, index=False)
          X_sample_train.to_csv(X_data_name,index=False)
          y_sample_train.to_csv(y_data_name, index=False)
          pickle.dump(model_fit, open(model_name_to_save, 'wb'))


In [10]:
def merge_and_rename_data(data1, data2, on_column, suffix1, suffix2):
    merged_data = pd.merge(data1, data2, on=on_column, suffixes=('_'+suffix1, '_'+suffix2))

    new_column_names = [col.replace(f'_{on_column}_{suffix1}', f'_{suffix1}').replace(f'_{on_column}_{suffix2}', f'_{suffix2}') for col in merged_data.columns]
    merged_data.rename(columns=dict(zip(merged_data.columns, new_column_names)), inplace=True)

    return merged_data

def pivot_data_by_HSA(data, index_column, columns_column, values_column):
    data_by_HSA = data[[index_column, columns_column, values_column]]
    pivot_table = data_by_HSA.pivot_table(index=index_column, columns=columns_column, values=values_column)
    return pivot_table

In [11]:
def add_changes_by_week(weekly_data_frame, outcome_column):

    for column in weekly_data_frame.columns[1:]:
        # Calculate the difference between each row and the previous row
        if outcome_column not in column.lower(): #want to leave out the outcome column
            diff = weekly_data_frame[column].diff()
            
            # Create a new column with the original column name and "delta"
            new_column_name = column + "_delta"
            
            column_index = weekly_data_frame.columns.get_loc(column)
            
            # Insert the new column just after the original column
            weekly_data_frame.insert(column_index + 1, new_column_name, diff)
            weekly_data_frame[new_column_name] = diff
    return weekly_data_frame

# Naive classifier bootstrapping 
- Cannot bootstrap - single, binary predictor 

# Needed data

In [17]:
param_grid = {
    'criterion': ['gini'],#,  'entropy'],
    'max_depth': np.arange(1, 10),
    'min_samples_split':  np.arange(2, 300), #[100, 200, 300, 400, 500], #np.arange(50, 200),
    'min_samples_leaf':  np.arange(2, 400)}#, #100, 200, 300, 400, 500], #np.arange(500, 200)
    #'ccp_alpha': np.arange(0.0001, 0.0035, 0.0001) }


# Create the Decision Tree classifier
cv = RepeatedStratifiedKFold(n_splits=10,  n_repeats=10,random_state=1) ## 10-fold cross validations


# Try CDC Classifier 

In [56]:
data_by_HSA = pd.read_csv('/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates.csv')
data_by_HSA['health_service_area_number']
data_by_HSA['health_service_area']
#data_by_HSA['HSA_ID'] = data_by_HSA['health_service_area_number'].astype(str) + '' + data_by_HSA['health_service_area'].apply(lambda x: x.split()[0])
data_by_HSA.rename(columns={'health_service_area_number': 'HSA_ID'}, inplace=True)

data_by_HSA['beds_over_15_100k'] = (data_by_HSA['beds_weekly'] > 15)*1

# remove HSAs that have missing data in specific columns

data_by_HSA = data_by_HSA.dropna(subset=['admits_weekly', 'deaths_weekly', 'cases_weekly', 'icu_weekly', 'beds_weekly', 'perc_covid'])

for i, week in enumerate(data_by_HSA['date'].unique()):
    data_by_HSA.loc[data_by_HSA['date'] == week, 'week'] = i

    ## pivot 
data_by_HSA_cases = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'cases_weekly')
data_by_HSA_admissions = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'admits_weekly')
data_by_HSA_percent_beds = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'perc_covid')
data_by_HSA_over_15_100k = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'beds_over_15_100k')

## merge 
data_by_HSA_cases_admits = merge_and_rename_data(data_by_HSA_cases, data_by_HSA_admissions,'week','cases', 'admits')
data_by_HSA_admits_perc_outcome = merge_and_rename_data(data_by_HSA_percent_beds, data_by_HSA_over_15_100k,'week','perc_covid', 'beds_over_15_100k')
data_by_HSA_cases_admits_perc_outcome= pd.merge(data_by_HSA_cases_admits, data_by_HSA_admits_perc_outcome, on='week')


data_by_HSA_cases_admits_perc_outcome = data_by_HSA_cases_admits_perc_outcome.reset_index()
data_by_HSA_cases_admits_perc_outcome.columns = data_by_HSA_cases_admits_perc_outcome.columns.str.replace(',', '')

categories_for_subsetting = ['cases', 'admits','perc_covid', 'beds_over_15_100k']
num_of_weeks = len(data_by_HSA_cases_admits_perc_outcome)
column_names = create_column_names(categories_for_subsetting, num_of_weeks)

all_HSA_ID_weekly_data = create_collated_weekly_data(data_by_HSA_cases_admits_perc_outcome, data_by_HSA, categories_for_subsetting, 'HSA_ID', column_names)

weights_df = data_by_HSA[data_by_HSA['HSA_ID'].isin(all_HSA_ID_weekly_data['HSA_ID'])][['HSA_ID','weight_alt']]
weights_df = weights_df.rename(columns = {'HSA_ID': 'HSA_ID', 'weight_alt':'weight'})
weights_df = weights_df.drop_duplicates()
weights_df['weight'].unique()
all_HSA_ID_weekly_data = all_HSA_ID_weekly_data.join(weights_df['weight'])

  data_by_HSA = pd.read_csv('/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates.csv')


Optimized classifier

In [59]:
X_train, y_train, weights, missing_data_train_HSA = prep_training_test_data(all_HSA_ID_weekly_data,  no_weeks = range(1, int(123*2/3) + 1), weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight', keep_output = False)

X_test, y_test, weights_test, missing_data_test_HSA = prep_training_test_data(all_HSA_ID_weekly_data,  no_weeks = range(int(123*2/3) + 1, 120), weeks_in_future = 3, if_train = False, geography = 'HSA_ID',  weight_col = 'weight', keep_output = False)
weights = weights[0].to_numpy()

In [23]:
CDC_exact = pickle.load(open("/Users/rem76/Documents/COVID_projections/COVID_forecasting/CDC_optimized_exact_auroc_0.8269_pruned.sav", 'rb'))
bootstrap_no_dev(iterations =range(0,100), clf = CDC_exact,  param_grid = param_grid,  cv = cv , iterations_param_search = 10,data = all_HSA_ID_weekly_data, model_name = "Optimized_CDC_classifier_exact_boostrap", time_period = 'exact', no_weeks = range(1, int(123*2/3) + 1),keep_output = False, weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight')

In [60]:
calculate_percentiles(iterations = range(0,100), model_name = "Optimized_CDC_classifier_exact_boostrap", ROC_actual = 0.8269, accuracy_actual = 0.758, sensitivity_actual = 0.757, specificity_actual = 0.762, ppv_actual = 0.870, npv_actual = 0.599, X_test = X_test, y_test =y_test)

(0.8237764055669351,
 0.8713994771420005,
 0.7432406617945595,
 0.7911488699418054,
 0.7204712637938783,
 0.8187186298497029,
 0.72609098541295,
 0.8105410851086158,
 0.8610524852530571,
 0.8916781952553877,
 0.5711349068555243,
 0.6465129179251549)

# Enhanced CDC optimizer

In [62]:
CDC_exact_enhanced = pickle.load(open("/Users/rem76/Documents/COVID_projections/COVID_forecasting/CDC_optimized_exact_enhanced_auroc_0.8280_pruned.sav", 'rb'))
X_train, y_train, weights, missing_data_train_HSA = prep_training_test_data(all_HSA_ID_weekly_data,  no_weeks = range(1, int(123*2/3) + 1), weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight', keep_output = True)

X_test, y_test, weights_test, missing_data_test_HSA = prep_training_test_data(all_HSA_ID_weekly_data,  no_weeks = range(int(123*2/3) + 1, 120), weeks_in_future = 3, if_train = False, geography = 'HSA_ID',  weight_col = 'weight', keep_output = True)
weights = weights[0].to_numpy()

In [20]:
bootstrap_no_dev(iterations =range(0,100), clf = CDC_exact_enhanced,  param_grid = param_grid,  cv = cv , iterations_param_search = 10,data = all_HSA_ID_weekly_data, model_name = "Optimized_CDC_classifier_enhanced_exact_boostrap", time_period = 'exact', no_weeks = range(1, int(123*2/3) + 1),keep_output = True, weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight')

In [64]:
calculate_percentiles(iterations = range(0,100), model_name = "Optimized_CDC_classifier_enhanced_exact_boostrap", ROC_actual = 0.8280, accuracy_actual = 0.7899, sensitivity_actual =  0.873, specificity_actual = 0.631, ppv_actual = 0.833, npv_actual = 0.702, X_test = X_test, y_test =y_test)

(0.8227005411620747,
 0.8733634891878018,
 0.7850713493030181,
 0.8310738530247667,
 0.8733780396464772,
 0.9924061017626205,
 0.5072775737223214,
 0.6309789064959597,
 0.7974115047131706,
 0.8334923780487804,
 0.7018748248482017,
 0.8160039231066827)

# Full Classifier 

In [24]:
data_by_HSA = pd.read_csv('/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates.csv')
data_by_HSA['health_service_area_number']
data_by_HSA['health_service_area']
#data_by_HSA['HSA_ID'] = data_by_HSA['health_service_area_number'].astype(str) + '' + data_by_HSA['health_service_area'].apply(lambda x: x.split()[0])
data_by_HSA.rename(columns={'health_service_area_number': 'HSA_ID'}, inplace=True)

data_by_HSA['beds_over_15_100k'] = (data_by_HSA['beds_weekly'] > 15)*1

data_by_HSA = data_by_HSA.dropna(subset=['admits_weekly', 'deaths_weekly', 'cases_weekly', 'icu_weekly', 'beds_weekly', 'perc_covid'])

for i, week in enumerate(data_by_HSA['date'].unique()):
    data_by_HSA.loc[data_by_HSA['date'] == week, 'week'] = i


## pivot 
data_by_HSA_cases = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'cases_weekly')
data_by_HSA_deaths = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'deaths_weekly')
data_by_HSA_admissions = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'admits_weekly')
data_by_HSA_icu = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'icu_weekly')
data_by_HSA_beds = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'beds_weekly')
data_by_HSA_percent_beds = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'perc_covid')
data_by_HSA_over_15_100k = pivot_data_by_HSA(data_by_HSA, 'week', 'HSA_ID', 'beds_over_15_100k')

## merge 
data_by_HSA_cases_deaths = merge_and_rename_data(data_by_HSA_cases, data_by_HSA_deaths,'week','cases', 'deaths')
data_by_HSA_admits_icu_weekly = merge_and_rename_data(data_by_HSA_admissions, data_by_HSA_icu,'week','admits', 'icu')
data_by_HSA_beds_perc_weekly = merge_and_rename_data(data_by_HSA_beds, data_by_HSA_percent_beds,'week','beds', 'perc_covid')
data_by_HSA_cases_deaths_admits_icu = pd.merge(data_by_HSA_cases_deaths, data_by_HSA_admits_icu_weekly, on='week')
data_by_HSA_cases_deaths_admits_icu_beds = pd.merge(data_by_HSA_cases_deaths_admits_icu, data_by_HSA_beds_perc_weekly, on='week')

## add outcome variable 

old_column_names = data_by_HSA_over_15_100k.columns
new_column_names = [str(col) + '_beds_over_15_100k' for col in old_column_names]
new_column_names = dict(zip(old_column_names, new_column_names))
data_by_HSA_over_15_100k.rename(columns=new_column_names, inplace=True)
data_by_HSA_cases_deaths_admits_icu_beds = pd.merge(data_by_HSA_cases_deaths_admits_icu_beds, data_by_HSA_over_15_100k, on='week')

data_by_HSA_cases_deaths_admits_icu_beds = data_by_HSA_cases_deaths_admits_icu_beds.reset_index()
data_by_HSA_cases_deaths_admits_icu_beds.columns = data_by_HSA_cases_deaths_admits_icu_beds.columns.str.replace(',', '')

categories_for_subsetting = ['cases', 'deaths', 'admits', 'icu', 'beds', 'perc_covid', 'beds_over_15_100k']
num_of_weeks = len(data_by_HSA_cases_deaths_admits_icu_beds)
column_names = create_column_names(categories_for_subsetting, num_of_weeks)

all_HSA_ID_weekly_data = create_collated_weekly_data(data_by_HSA_cases_deaths_admits_icu_beds, data_by_HSA, categories_for_subsetting, 'HSA_ID', column_names)

all_HSA_ID_weekly_data = add_changes_by_week(all_HSA_ID_weekly_data, "beds_over_15_100k")

weights_df = data_by_HSA[data_by_HSA['HSA_ID'].isin(all_HSA_ID_weekly_data['HSA_ID'])][['HSA_ID','weight_alt']]
weights_df = weights_df.rename(columns = {'HSA_ID': 'HSA_ID', 'weight_alt':'weight'})
weights_df = weights_df.drop_duplicates()
weights_df['weight'].unique()
all_HSA_ID_weekly_data = all_HSA_ID_weekly_data.join(weights_df['weight'])

  data_by_HSA = pd.read_csv('/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates.csv')
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_fr

Period 

In [34]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(2,20),
    'min_samples_split': np.arange(2, 2000, 50), #[100, 200, 300, 400, 500], #np.arange(50, 200),
    'min_samples_leaf':  np.arange(2, 2000, 50)} #100, 200, 300, 400, 500], #np.arange(500, 200)
# Create the Decision Tree classifier

In [50]:
Full_period = pickle.load(open("/Users/rem76/Documents/COVID_projections/COVID_forecasting/Full_auroc_0.9140_period_pruned.sav", 'rb'))
X_train, y_train, weights, missing_data_train_HSA = prep_training_test_data_period(all_HSA_ID_weekly_data,  no_weeks = range(1, int(123*2/3) + 1), weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight', keep_output = True)

X_test, y_test, weights_test, missing_data_test_HSA = prep_training_test_data_period(all_HSA_ID_weekly_data,  no_weeks = range(int(123*2/3) + 1, 120), weeks_in_future = 3, if_train = False, geography = 'HSA_ID',  weight_col = 'weight', keep_output = True)
weights = weights[0].to_numpy()

In [35]:
bootstrap_no_dev(iterations =range(0,100), clf = Full_period,  param_grid = param_grid,  cv = cv , iterations_param_search = 10,data = all_HSA_ID_weekly_data, model_name = "Full_classifier_period_boostrap", time_period = 'period', no_weeks = range(1, int(123*2/3) + 1),keep_output = True, weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight')

In [51]:
calculate_percentiles(iterations = range(0,100), model_name = "Full_classifier_period_boostrap", ROC_actual = 0.914, accuracy_actual = 0.803, sensitivity_actual = 0.779, specificity_actual = 0.899, ppv_actual = 0.969, npv_actual = 0.502, X_test = X_test, y_test =y_test)

(0.9139349697691534,
 0.9472627850316913,
 0.7881362987145482,
 0.8304159355233627,
 0.7552374167832762,
 0.8178142729932579,
 0.8768253477588872,
 0.9177879100120213,
 0.9635627321536879,
 0.9740767468931384,
 0.4800760295259304,
 0.5378445327857033)

Exact 

In [65]:
Full_exact = pickle.load(open("/Users/rem76/Documents/COVID_projections/COVID_forecasting/Full_auroc_0.8594_exact_pruned.sav", 'rb'))


In [68]:

X_test, y_test, weights_test, missing_data_test_HSA = prep_training_test_data(all_HSA_ID_weekly_data,  no_weeks = range(int(123*2/3) + 1, 120), weeks_in_future = 3, if_train = False, geography = 'HSA_ID',  weight_col = 'weight', keep_output = True)

bootstrap_no_dev(iterations =range(0,100), clf = Full_exact,  param_grid = param_grid,  cv = cv , iterations_param_search = 10,data = all_HSA_ID_weekly_data, model_name = "Full_classifier_exact_boostrap", time_period = 'exact', no_weeks = range(1, int(123*2/3) + 1),keep_output = True, weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight')

0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7379752859927271
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7380002522882282
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7541161676463399
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7372190867965774
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7379113860052178
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7346276382482348
0.7518215269287816
0.7518215269287816
0.736286444090697
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7518215269287816
0.7381102727185651
0.7518215269287816
0.7518215269287816
0.73741373953

In [None]:
calculate_percentiles(iterations = range(0,100), model_name = "Full_classifier_exact_boostrap", ROC_actual = 0.783, accuracy_actual = 0.762, sensitivity_actual = 0.722, specificity_actual = 0.846, ppv_actual = .908, npv_actual = 0.591, X_test = X_test, y_test =y_test)

Shifted

In [40]:
Full_shifted = pickle.load(open("/Users/rem76/Documents/COVID_projections/COVID_forecasting/Full_auroc_0.7268_shifted_pruned.sav", 'rb'))
bootstrap_no_dev(iterations =range(0,100), clf = Full_shifted,  param_grid = param_grid,  cv = cv , iterations_param_search = 10,data = all_HSA_ID_weekly_data, model_name = "Full_classifier_shifted_boostrap", time_period = 'shifted', no_weeks = range(1, int(123*2/3) + 1),keep_output = True, weeks_in_future = 3, if_train = False, geography = 'HSA_ID', weight_col = 'weight')

In [41]:
X_test, y_test, weights_test, missing_data_test_HSA = prep_training_test_data_period(all_HSA_ID_weekly_data,  no_weeks = range(int(123*2/3) + 1, 120), weeks_in_future = 3, if_train = False, geography = 'HSA_ID',  weight_col = 'weight', keep_output = True)

y_test = y_test.shift(-1)

y_test.drop(index=y_test.index[-1], inplace=True)
X_test.drop(index=X_test.index[-1], inplace=True)

In [48]:
calculate_percentiles(iterations = range(0,100), model_name = "Full_classifier_shifted_boostrap", ROC_actual = 0.7268, accuracy_actual = 0.573, sensitivity_actual = 0.509, specificity_actual = 0.833, ppv_actual = 0.925, npv_actual = 0.295, X_test = X_test, y_test =y_test)

(0.6921497173691434,
 0.7336733737276049,
 0.5184196565209997,
 0.6347663662642407,
 0.43228632007463325,
 0.5958390297684675,
 0.7883209685729005,
 0.8971136871028679,
 0.914165804249595,
 0.9415585029396056,
 0.26846141179683686,
 0.32056097066054867)