# Train the decision trees based on the latest data available from: https://healthdata.gov/Hospital/COVID-19-Reported-Patient-Impact-and-Hospital-Capa/anag-cw7u

NB - there is no longer regular updating of case and death data. Must omit from model training. 

In [60]:
#%reset

In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef
from num2words import num2words
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold, RepeatedStratifiedKFold
from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score
import word2number
from word2number import w2n
from sklearn.tree import DecisionTreeClassifier
import pickle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import RocCurveDisplay
import random
from matplotlib.patches import Polygon

from Functions import prep_training_test_data_period, prep_training_test_data, calculate_metrics,cross_validation_leave_geo_out, prep_training_test_data_shifted, add_labels_to_subplots, LOOCV_by_HSA_dataset, save_in_HSA_dictionary, pivot_data_by_HSA, merge_and_rename_data, add_changes_by_week, create_column_names, create_collated_weekly_data
hfont = {'fontname':'Helvetica'}
palette = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854', '#e5c494']

# Import and prepare data

In [62]:
HSA_weekly_data_updated = pd.read_csv("/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates_with_state_fips_latest_data.csv")


  HSA_weekly_data_updated = pd.read_csv("/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates_with_state_fips_latest_data.csv")


In [2]:
HSA_weekly_data_updated = pd.read_csv("/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates_with_state_fips.csv")


  HSA_weekly_data_updated = pd.read_csv("/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates_with_state_fips.csv")


In [3]:
HSA_weekly_data_updated.columns


Index(['Unnamed: 0', 'date', 'health_service_area_number',
       'health_service_area', 'health_service_area_population', 'cases_avg',
       'deaths_avg', 'cases_avg_per_100k', 'deaths_avg_per_100k',
       'POPESTIMATE2019', 'ymd', 'year', 'week', 'year_wk', 'state', 'fips',
       'admits_confirmed_avg', 'perc_covid', 'admits_confirmed_100K',
       'icu_confirmed_avg', 'icu_100K', 'beds_100k', 'cdc_flag_1',
       'cdc_flag_2', 'cdc_flag', 'deaths_21_lag_100k', 'icu_21_lag_100K',
       'cases_lag_21_100K', 'admits_7_lag', 'admits_7d_ago', 'admits_21d_ago',
       'admits_28d_ago', 'dotw', 'chk', 'county_rank',
       'deaths_21_lag_100k_14d', 'deaths_weekly', 'admits_weekly',
       'cases_weekly', 'icu_weekly', 'beds_weekly', 'perc_covid_100', 'cfr',
       'half_zeke_time_3', 'chk2', 'zeke_time_3', 'two_zeke_time_3',
       'icu_2_time_3', 'perc_covid_10_time_3', 'change_admits', 'change_perc',
       'change_cases', 'zeke_time_3_14d', 'two_zeke_time_3_14d', 'weight',
       'w

In [4]:
HSA_weekly_data_updated.rename(columns={'health_service_area_number': 'HSA_ID'}, inplace=True)
HSA_weekly_data_updated['beds_over_15_100k'] = (HSA_weekly_data_updated['beds_weekly'] > 15)*1
HSA_weekly_data_updated_features = HSA_weekly_data_updated.dropna(subset=['admits_weekly', 'icu_weekly', 'beds_weekly', 'perc_covid'])
for i, week in enumerate(HSA_weekly_data_updated_features['date'].unique()):
    HSA_weekly_data_updated_features.loc[HSA_weekly_data_updated_features['date'] == week, 'week'] = i

Merge dataframes

In [5]:
## pivot 
data_by_HSA_admissions = pivot_data_by_HSA(HSA_weekly_data_updated_features, 'week', 'HSA_ID', 'admits_weekly')
data_by_HSA_icu = pivot_data_by_HSA(HSA_weekly_data_updated_features, 'week', 'HSA_ID', 'icu_weekly')
data_by_HSA_beds = pivot_data_by_HSA(HSA_weekly_data_updated_features, 'week', 'HSA_ID', 'beds_weekly')
data_by_HSA_percent_beds = pivot_data_by_HSA(HSA_weekly_data_updated_features, 'week', 'HSA_ID', 'perc_covid')
data_by_HSA_over_15_100k = pivot_data_by_HSA(HSA_weekly_data_updated_features, 'week', 'HSA_ID', 'beds_over_15_100k')

## merge 
data_by_HSA_admits_icu_weekly = merge_and_rename_data(data_by_HSA_admissions, data_by_HSA_icu,'week','admits', 'icu')
data_by_HSA_beds_perc_weekly = merge_and_rename_data(data_by_HSA_beds, data_by_HSA_percent_beds,'week','beds', 'perc_covid')
data_by_HSA_cases_beds_perc_admits_icu = pd.merge(data_by_HSA_beds_perc_weekly, data_by_HSA_admits_icu_weekly, on='week')

## add outcome variable 

old_column_names = data_by_HSA_over_15_100k.columns
new_column_names = [str(col) + '_beds_over_15_100k' for col in old_column_names]
new_column_names = dict(zip(old_column_names, new_column_names))
data_by_HSA_over_15_100k.rename(columns=new_column_names, inplace=True)
data_by_HSA_cases_admits_icu_beds = pd.merge(data_by_HSA_cases_beds_perc_admits_icu, data_by_HSA_over_15_100k, on='week')

data_by_HSA_cases_admits_icu_beds = data_by_HSA_cases_admits_icu_beds.reset_index()
data_by_HSA_cases_admits_icu_beds.columns = data_by_HSA_cases_admits_icu_beds.columns.str.replace(',', '')

Get weekly changes

In [6]:
all_HSA_ID_weekly_data = add_changes_by_week(data_by_HSA_cases_admits_icu_beds, "beds_over_15_100k")

  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new_column_name, diff)
  weekly_data_frame.insert(column_index + 1, new

In [7]:
categories_for_subsetting = [ 'admits', 'icu', 'beds', 'perc_covid',  'admits_delta', 'icu_delta', 'beds_delta', 'perc_covid_delta','beds_over_15_100k']
num_of_weeks = len(all_HSA_ID_weekly_data)
column_names = create_column_names(categories_for_subsetting, num_of_weeks)

In [8]:
all_HSA_ID_weekly_data = create_collated_weekly_data(all_HSA_ID_weekly_data, HSA_weekly_data_updated, categories_for_subsetting, 'HSA_ID', column_names)

Add weights 

In [9]:
weights_df = HSA_weekly_data_updated[HSA_weekly_data_updated['HSA_ID'].isin(all_HSA_ID_weekly_data['HSA_ID'])][['HSA_ID','weight']]
weights_df = weights_df.rename(columns = {'HSA_ID': 'HSA_ID', 'weight':'weight'})
weights_df = weights_df.drop_duplicates()
weights_df['weight'].unique()
all_HSA_ID_weekly_data = all_HSA_ID_weekly_data.join(weights_df['weight'])

Save file

In [10]:
# write a csv file with all the data
all_HSA_ID_weekly_data.to_csv("/Users/rem76/Documents/COVID_projections/hsa_time_data_all_dates_with_state_fips_latest_data.csv", index=False)

# Maximum week

In [81]:
HSA_weekly_data_updated_features['week'].max()

198

# Get DTC

In [70]:
no_iterations = 10
geography_column = 'HSA_ID'  
geo_split = 0.9  
time_period = 'period'  # Choose 'period', 'exact', or 'shifted'
size_of_test_dataset = 1
train_weeks_for_initial_model = 1
weeks_in_future = 3 
weight_col = 'weight'  
keep_output = True  

no_iterations_param = 100  # Replace with the number of iterations for RandomizedSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(2, 5, 1),
    'min_samples_split': np.arange(200, 2000, 50), #[100, 200, 300, 400, 500], #np.arange(50, 200),
    'min_samples_leaf':  np.arange(200, 2000, 50)} #100, 200, 300, 400, 500], #np.arange(500, 200)
    #'ccp_alpha': np.arange(0.0001, 0.0035, 0.0001) }
weeks_to_predict = range(1, 123 - size_of_test_dataset - 3 - train_weeks_for_initial_model)

# Create the Decision Tree classifier
cv = RepeatedStratifiedKFold(n_splits=10,  n_repeats=10,random_state=1) ## 10-fold cross validations

In [71]:
ROC_by_week_full_period = []
sensitivity_by_week_full_period = []
specificity_by_week_full_period = []
ppv_by_week_full_period = []
npv_by_week_full_period = []
accuracy_by_week_full_period = []
norm_MCC_by_week_full_period = []

In [91]:

######### ACTUAL RUNS ############
weeks_to_predict = [195]
ROC_by_week_full_period = []
sensitivity_by_week_full_period = []
specificity_by_week_full_period = []
ppv_by_week_full_period = []
npv_by_week_full_period = []
accuracy_by_week_full_period = []
norm_MCC_by_week_full_period = []
clf_full_period = DecisionTreeClassifier(random_state=10, class_weight="balanced")
for prediction_week in weeks_to_predict:
    print(prediction_week)
    no_weeks_train = range(1, int(prediction_week + train_weeks_for_initial_model) + 1)
    no_weeks_test = range(
        int(prediction_week + train_weeks_for_initial_model) + 1,
        int(prediction_week + train_weeks_for_initial_model + size_of_test_dataset) + 1,
    )
    (
        X_train_full_period,
        y_train_full_period,
        weights_full_period,
        missing_data_train_HSA,
    ) = prep_training_test_data_period(
        all_HSA_ID_weekly_data,
        no_weeks=no_weeks_train,
        weeks_in_future=3,
        geography="HSA_ID",
        weight_col="weight",
        keep_output=True,
    )

    (
        X_test_full_period,
        y_test_full_period,
        weights_test_full_period,
        missing_data_test_HSA,
    ) = prep_training_test_data_period(
        all_HSA_ID_weekly_data,
        no_weeks=no_weeks_test,
        weeks_in_future=3,
        geography="HSA_ID",
        weight_col="weight",
        keep_output=True,
    )
    weights_full_period = weights_full_period[0].to_numpy()
    best_params = cross_validation_leave_geo_out(
        all_HSA_ID_weekly_data,
        geography_column=geography_column,
        geo_split=geo_split,
        no_iterations=no_iterations,
        cv=cv,
        classifier=clf_full_period,
        param_grid=param_grid,
        no_iterations_param=no_iterations_param,
        no_weeks_train=no_weeks_train,
        no_weeks_test=no_weeks_test,
        weeks_in_future=weeks_in_future,
        weight_col=weight_col,
        keep_output=keep_output,
        time_period="period",
    )
    clf_full_period = DecisionTreeClassifier(
        **best_params, random_state=10, class_weight="balanced"
    )
    clf_full_period.fit(
        X_train_full_period, y_train_full_period, sample_weight=weights_full_period
    )

    # Make predictions on the test set
    y_pred = clf_full_period.predict(X_test_full_period)
    y_pred_proba = clf_full_period.predict_proba(X_test_full_period)

    # Evaluate the accuracy of the model
    accuracy_by_week_full_period.append(accuracy_score(y_test_full_period, y_pred))
    ROC_by_week_full_period.append(
        roc_auc_score(y_test_full_period, y_pred_proba[:, 1])
    )
    conf_matrix = confusion_matrix(y_test_full_period, y_pred)

    model_name_to_save = (
        "Full_model_" + time_period + "_" + str(prediction_week) + ".sav"
    )

    pickle.dump(clf_full_period, open(model_name_to_save, "wb"))
    sensitvity, specificity, ppv, npv = calculate_metrics(conf_matrix)
    specificity_by_week_full_period.append(specificity)
    # Calculate sensitivity (true positive rate)
    sensitivity_by_week_full_period.append(sensitvity)
    norm_MCC_by_week_full_period.append(
        (matthews_corrcoef(y_test_full_period, y_pred) + 1) / 2
    )

    ppv_by_week_full_period.append(ppv)
    npv_by_week_full_period.append(npv)

195


In [89]:
ROC_by_week_full_period

[0.8239374262101535]