In [1]:
import sys
sys.path.append('../.')

import datetime
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc

from lib import get_data
from lib import simplified_model

# Render figure text with matplotlib's built-in text engine rather than an external LaTeX installation.
rc('text', usetex=False)

../lib
Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,16 hours 29 mins
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.1.2
H2O cluster version age:,1 month and 16 days
H2O cluster name:,H2O_from_python_tcai_1f2d4e
H2O cluster total nodes:,1
H2O cluster free memory:,15.97 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


In [2]:
def prepare_model_data(date_range, pred_day, outcome):
    """Build the modelling table (features plus one target column) for a single window.

    Rows containing NaN or +/-inf in any column are discarded before the features
    are passed through ``simplified_model.multicollinearity_check``.

    Args:
        date_range: forwarded unchanged to ``get_data.get_model_data``; callers in this
            notebook pass a ``(start_day, end_day)`` tuple.
        pred_day: day index used to locate the ``day_{pred_day}_delta_*`` columns.
        outcome: suffix of the target column, i.e. ``day_{pred_day}_delta_{outcome}``
            must exist in the fetched data (this notebook uses 'cases' and 'deaths').

    Returns:
        DataFrame holding the checked feature columns (with 'deaths'/'cases' renamed to
        'past_deaths'/'past_cases') and the target column renamed to ``outcome``.
    """
    fetched = get_data.get_model_data(date_range=date_range, pred_day=pred_day)
    # Treat infinities as missing so that a single dropna removes both kinds of bad rows.
    complete_rows = fetched.replace([np.inf, -np.inf], np.nan).dropna()  # todo check effect of imputation

    # todo feature engineering ideas
    # longitude and latitude

    # Both delta columns are targets and the remaining three are identifiers, so none are features.
    excluded_columns = [f'day_{pred_day}_delta_cases', f'day_{pred_day}_delta_deaths', 'state', 'county', 'fips']
    features = simplified_model.multicollinearity_check(complete_rows.drop(columns=excluded_columns))
    features = features.rename(columns={'deaths': 'past_deaths', 'cases': 'past_cases'})

    target_column = f'day_{pred_day}_delta_{outcome}'
    target = complete_rows[[target_column]].rename(columns={target_column: outcome})

    # todo add pred day End-of-Period metrics
    return pd.concat([features, target], axis=1)

In [3]:
model_start_time = datetime.datetime.now().strftime('%m_%d_%H_%M')  # as a flag to track separate model results

periods = 30  # total number of days to try in each period
prediction_period = 3  # the target day lies this many days after the end of the training window
min_rows = 200  # windows with this many rows or fewer are skipped
output_df = pd.DataFrame()
output_frames = []  # one result frame per fitted model; concatenated once in the `finally` below

# model by start days to show changing predictability over time
try:
    for training_range in tqdm([7]):  # length in days of the training window; add values to compare windows
        mae = {'cases': [], 'deaths': []}  # todo outcome try actual n cases
        for day in range(periods):
            date_range = (day, day + training_range)  # interval controls the length of data collected
            y_day = day + training_range + prediction_period  # currently only predict results from n days out
            print('---', date_range, y_day)

            for y in ['cases', 'deaths']:
                tmp_df = prepare_model_data(date_range, y_day, y)

                if len(tmp_df) <= min_rows:
                    continue

                # Reuse the frame prepared above instead of fetching and cleaning the same data twice.
                tmp_output_df = simplified_model.linear(tmp_df,
                                                        outcome=y, family='gaussian', link='identity', seed=1,
                                                        model_name=f'{date_range[0]}_{date_range[1]}_{y_day}',
                                                        suffix=model_start_time)

                try:
                    mae[y].append(list(set(tmp_output_df['mae']))[0])
                    tmp_output_df['start_date'] = date_range[0]
                    tmp_output_df['end_date'] = date_range[1]
                    tmp_output_df['interval'] = training_range
                    tmp_output_df['pred_date'] = y_day
                    tmp_output_df['outcome_name'] = y
                    output_frames.append(tmp_output_df)
                except TypeError:
                    # NOTE(review): presumably simplified_model.linear returned no result table
                    # (e.g. None) for this window - confirm against its implementation.
                    print('--- No longer predictive')
                    break  # leaves only the outcome loop; the next start day is still tried
finally:
    # Runs even when the loop is interrupted, so models fitted so far are kept in output_df.
    # A single concat also replaces DataFrame.append, which was removed in pandas 2.0.
    if output_frames:
        output_df = pd.concat(output_frames)

  0%|          | 0/1 [00:00<?, ?it/s]

--- (0, 7) 10
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
--- (1, 8) 11


  0%|          | 0/1 [00:22<?, ?it/s]
ERROR:concurrent.futures:exception calling callback for <Future at 0x11eaa83d0 state=finished returned list>
Traceback (most recent call last):
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 340, in __call__
    self.parallel.dispatch_next()
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 769, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 835, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 754, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/U

ERROR:concurrent.futures:exception calling callback for <Future at 0x11ea89110 state=finished raised BrokenProcessPool>
Traceback (most recent call last):
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 340, in __call__
    self.parallel.dispatch_next()
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 769, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 835, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tcai/miniconda3/envs/covid19/lib/python3.7/site-packages/joblib/parallel.py", line 754, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tcai/miniconda3/envs/c

KeyboardInterrupt: 

In [6]:
# Write the collected model results to a report file tagged with this run's start time.
coef_report_path = f'../reports/model_coef_{model_start_time}.csv'
output_df.to_csv(coef_report_path, index=False)

In [None]:
# coefficient plots: one panel per model term, showing its coefficient against the window start day
n_cols = 4  # panels per row
for y in ['deaths', 'cases']:
    tmp_output_df = output_df[output_df['outcome_name'] == y]
    # unique() keeps first-seen order, so the panel layout is the same on every run
    # (iterating a set of strings is not stable between interpreter sessions).
    variable_names = tmp_output_df['names'].unique()
    n_variables = len(variable_names)
    if n_variables == 0:
        # plt.subplots raises on a zero-row grid, so skip outcomes without any fitted model
        print(f'--- No coefficients to plot for {y}')
        continue

    # squeeze=False keeps `axes` two-dimensional even for a single row, so the
    # [row, column] indexing below also works with four or fewer variables.
    fig, axes = plt.subplots(int(np.ceil(n_variables / n_cols)), n_cols,
                             figsize=(16, 10), sharex='all', squeeze=False)
    fig.suptitle(f'Coefficients by start day: {y}')

    for i, c in enumerate(variable_names):
        ax = axes[i // n_cols, i % n_cols]
        variable_df = tmp_output_df[tmp_output_df['names'] == c]  # filter once per panel
        ax.plot(variable_df['start_date'], variable_df['coefficients'])
        ax.set_title(f'{c}')
        ax.set_ylabel('Coefficient')
        ax.set_xlabel('Day')

    plt.show()

In [None]:
# performance plots: one figure per (metric, outcome) pair
# The metrics are repeated on every coefficient row of a model, so keep one row per model.
# This table does not depend on the metric being plotted, so it is built once up front.
performance_df = output_df[['start_date', 'r2', 'mae', 'outcome_name']].drop_duplicates()
for metric in ['r2', 'mae']:
    for y in ['deaths', 'cases']:
        tmp_performance_df = performance_df[performance_df['outcome_name'] == y]
        fig, ax = plt.subplots()
        ax.plot(tmp_performance_df['start_date'], tmp_performance_df[metric])
        # Name the metric and outcome so the four figures can be told apart.
        ax.set_title(f'{metric} by Day ({y})')
        ax.set_xlabel('Day')
        ax.set_ylabel(metric)
        plt.show()

In [None]:
# try gam: days 0-7 as the training window, deaths on day 14 as the target
gam_model_data = prepare_model_data((0, 7), 14, 'deaths')
fig = simplified_model.linear_gam(gam_model_data, 'deaths', seed=1)
plt.show()