In [1]:
#Importing libraries
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from os import getcwd
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Embedding, Dropout
from sklearn import preprocessing
import datetime as dt
import io
from sagemaker import get_execution_role
import os
import boto3


In [2]:
role = get_execution_role()
input_bucket = 'hackathon-wwc'

# Dataset for COVID-19 cases in California by counties
cases_data = pd.read_csv('s3://hackathon-wwc/statewide_cases.csv')
# Dataset for hospitalized COVID-19 cases in California by counties
hospital_data = pd.read_csv('s3://hackathon-wwc/hospitals_by_county.csv')


In [3]:
cases_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8525 entries, 0 to 8524
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   county               8525 non-null   object 
 1   totalcountconfirmed  8522 non-null   float64
 2   totalcountdeaths     8523 non-null   float64
 3   newcountconfirmed    8525 non-null   int64  
 4   newcountdeaths       8525 non-null   int64  
 5   date                 8525 non-null   object 
dtypes: float64(2), int64(2), object(2)
memory usage: 399.7+ KB


In [4]:
hospital_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7389 entries, 0 to 7388
Data columns (total 9 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   county                                 7389 non-null   object 
 1   todays_date                            7389 non-null   object 
 2   hospitalized_covid_confirmed_patients  7381 non-null   float64
 3   hospitalized_suspected_covid_patients  7381 non-null   float64
 4   hospitalized_covid_patients            6104 non-null   float64
 5   all_hospital_beds                      6014 non-null   float64
 6   icu_covid_confirmed_patients           7360 non-null   float64
 7   icu_suspected_covid_patients           7360 non-null   float64
 8   icu_available_beds                     6585 non-null   float64
dtypes: float64(7), object(2)
memory usage: 519.7+ KB


In [5]:
cases_data.head()

Unnamed: 0,county,totalcountconfirmed,totalcountdeaths,newcountconfirmed,newcountdeaths,date
0,Santa Clara,151.0,6.0,151,6,2020-03-18
1,Santa Clara,183.0,8.0,32,2,2020-03-19
2,Santa Clara,246.0,8.0,63,0,2020-03-20
3,Santa Clara,269.0,10.0,23,2,2020-03-21
4,Santa Clara,284.0,13.0,15,3,2020-03-22


In [6]:
hospital_data.head()

Unnamed: 0,county,todays_date,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,all_hospital_beds,icu_covid_confirmed_patients,icu_suspected_covid_patients,icu_available_beds
0,Tuolumne,2020-03-29,1.0,5.0,,,0.0,0.0,0.0
1,Siskiyou,2020-03-29,0.0,0.0,,,0.0,0.0,2.0
2,Calaveras,2020-03-29,,,,,,,
3,Stanislaus,2020-03-29,20.0,48.0,,,9.0,8.0,25.0
4,Riverside,2020-03-29,44.0,78.0,,,13.0,20.0,46.0


In [7]:
# Preparing the final dataset
data = pd.merge(cases_data, hospital_data, how = 'inner', left_on = ['county','date'], right_on = ['county','todays_date'])
data=data[['county','date','newcountconfirmed','newcountdeaths','hospitalized_covid_confirmed_patients','hospitalized_suspected_covid_patients','hospitalized_covid_patients','icu_covid_confirmed_patients','icu_suspected_covid_patients']]
data['date']=pd.to_datetime(data['date'])
data = data[data['date']>'2020-04-22']
data.fillna(0,inplace=True)
data.isnull().sum()

county                                   0
date                                     0
newcountconfirmed                        0
newcountdeaths                           0
hospitalized_covid_confirmed_patients    0
hospitalized_suspected_covid_patients    0
hospitalized_covid_patients              0
icu_covid_confirmed_patients             0
icu_suspected_covid_patients             0
dtype: int64

In [8]:
data.head()

Unnamed: 0,county,date,newcountconfirmed,newcountdeaths,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,icu_covid_confirmed_patients,icu_suspected_covid_patients
25,Santa Clara,2020-04-23,27,1,137.0,28.0,165.0,67.0,5.0
26,Santa Clara,2020-04-24,30,3,129.0,34.0,163.0,60.0,5.0
27,Santa Clara,2020-04-25,19,1,125.0,47.0,172.0,62.0,15.0
28,Santa Clara,2020-04-26,52,3,118.0,64.0,182.0,60.0,17.0
29,Santa Clara,2020-04-27,7,3,115.0,49.0,164.0,56.0,10.0


In [9]:
# Extracting the county names from dataset
county_name = data["county"].unique()
print(county_name)
print(len(county_name))

['Santa Clara' 'San Mateo' 'Santa Barbara' 'Tuolumne' 'Placer'
 'San Luis Obispo' 'Solano' 'Monterey' 'Yuba' 'San Francisco' 'Orange'
 'Mono' 'Calaveras' 'Nevada' 'Butte' 'Santa Cruz' 'Mendocino' 'Fresno'
 'Inyo' 'San Joaquin' 'Ventura' 'Trinity' 'Mariposa' 'El Dorado' 'Sonoma'
 'Shasta' 'Lassen' 'Colusa' 'Lake' 'Modoc' 'Tulare' 'San Benito' 'Alameda'
 'Marin' 'San Diego' 'Glenn' 'Siskiyou' 'Madera' 'Tehama' 'Sutter' 'Napa'
 'Los Angeles' 'Sacramento' 'Merced' 'Kings' 'Kern' 'Riverside'
 'Contra Costa' 'Del Norte' 'San Bernardino' 'Stanislaus' 'Humboldt'
 'Plumas' 'Amador' 'Imperial' 'Yolo']
56


In [10]:
# Model Parameters
time_steps = 30  # No. of historical data to be used for forecasting
batch_size = 1024  # No. of observations to be used at a time for training
no_of_days_to_predict = 3 # No. of days to forecasted

In [11]:
def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

In [12]:
# Scaling the data
scaler = preprocessing.StandardScaler()
scaled_data = scaler.fit_transform(data[['newcountconfirmed', 'newcountdeaths', 'hospitalized_covid_confirmed_patients', 'hospitalized_suspected_covid_patients','hospitalized_covid_patients', 'icu_covid_confirmed_patients', 'icu_suspected_covid_patients']])

data["newcountconfirmed"] = scaled_data[:, 0]
data["newcountdeaths"] = scaled_data[:, 1]
data["hospitalized_covid_confirmed_patients"] = scaled_data[:, 2]
data["hospitalized_suspected_covid_patients"] = scaled_data[:, 3]
data["hospitalized_covid_patients"] = scaled_data[:, 4]
data["icu_covid_confirmed_patients"] = scaled_data[:, 5]
data["icu_suspected_covid_patients"] = scaled_data[:, 6]


In [13]:
# Preparing dataset for training LSTM

X_train, y_train = [], []

for county in county_name:
    filtered_data = data.loc[data["county"] == county]
    filtered_data = filtered_data.sort_values(by='date',ascending=True)
    train_size = len(filtered_data)
    filtered_train = filtered_data.iloc[0:train_size]
  
    filtered_X_train, filtered_y_train = create_dataset(filtered_train[['newcountconfirmed', 'newcountdeaths', 'hospitalized_covid_confirmed_patients', 'hospitalized_suspected_covid_patients', 'hospitalized_covid_patients','icu_covid_confirmed_patients', 'icu_suspected_covid_patients']], 
                                                      filtered_train[['hospitalized_covid_patients', 'icu_covid_confirmed_patients']], time_steps)

    if(len(X_train) == 0):
        X_train, y_train = filtered_X_train, filtered_y_train
    else:
        X_train = np.vstack((X_train, filtered_X_train))
        y_train = np.vstack((y_train, filtered_y_train))

print(X_train.shape, y_train.shape)

(4312, 30, 7) (4312, 2)


In [15]:
#Defining LSTM Model

model = Sequential()
model.add(LSTM(batch_size, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.1))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu'))
model.add(Dense(2))

model.summary()

model.compile(loss = 'mean_squared_error', optimizer = 'rmsprop', metrics = ['mse', 'accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 30, 1024)          4227072   
_________________________________________________________________
dropout (Dropout)            (None, 30, 1024)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 512)           3147776   
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 512)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 256)           787456    
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 256)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)               

In [16]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 5746171629440068154,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 15195801934191434902
 physical_device_desc: "device: XLA_CPU device",
 name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 14386136619623368562
 physical_device_desc: "device: XLA_GPU device",
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11329617920
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 12535539517677464683
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0, compute capability: 3.7"]

In [18]:
# Training the model for 1000 epochs. Skip to next step for already trained model.

from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("best_model.hdf5", monitor='accuracy', verbose=1,
    save_best_only=True, mode='auto', period=1)

history = model.fit(
    X_train, y_train,
    epochs = 1000,
    batch_size = batch_size,
    callbacks=[checkpoint],
    verbose=1
)

Train on 4312 samples
Epoch 1/1000
Epoch 00001: accuracy improved from -inf to 0.50232, saving model to best_model.hdf5
Epoch 2/1000
Epoch 00002: accuracy improved from 0.50232 to 0.53340, saving model to best_model.hdf5
Epoch 3/1000
Epoch 00003: accuracy improved from 0.53340 to 0.55798, saving model to best_model.hdf5
Epoch 4/1000
Epoch 00004: accuracy did not improve from 0.55798
Epoch 5/1000
Epoch 00005: accuracy improved from 0.55798 to 0.56656, saving model to best_model.hdf5
Epoch 6/1000
Epoch 00006: accuracy did not improve from 0.56656
Epoch 7/1000
Epoch 00007: accuracy improved from 0.56656 to 0.57537, saving model to best_model.hdf5
Epoch 8/1000
Epoch 00008: accuracy improved from 0.57537 to 0.61410, saving model to best_model.hdf5
Epoch 9/1000
Epoch 00009: accuracy improved from 0.61410 to 0.64100, saving model to best_model.hdf5
Epoch 10/1000
Epoch 00010: accuracy did not improve from 0.64100
Epoch 11/1000
Epoch 00011: accuracy did not improve from 0.64100
Epoch 12/1000
Ep

In [17]:
# Loading previously trained model and making the forecasts

model = keras.models.load_model("best_model.hdf5")

forecasted_data = pd.DataFrame()

for county in county_name:
    demo_data = data[data['county'] == county]
    demo_data = demo_data[['newcountconfirmed', 'newcountdeaths', 'hospitalized_covid_confirmed_patients', 'hospitalized_suspected_covid_patients', 'hospitalized_covid_patients','icu_covid_confirmed_patients', 'icu_suspected_covid_patients']]
    demo_data = demo_data.tail(time_steps + 1)

    for day in range(no_of_days_to_predict):
        demo_X_test, demo_y_test = create_dataset(demo_data[['newcountconfirmed', 'newcountdeaths', 'hospitalized_covid_confirmed_patients', 'hospitalized_suspected_covid_patients', 'hospitalized_covid_patients','icu_covid_confirmed_patients', 'icu_suspected_covid_patients']].tail(time_steps + 1), 
                                                        demo_data[['hospitalized_covid_patients', 'icu_covid_confirmed_patients']].tail(time_steps + 1), time_steps)
    
        predictions = np.concatenate((model.predict(demo_X_test), np.zeros((1, 5))), axis = 1)
        #print(predictions)
        predictions[0,2] = np.array(demo_data['newcountconfirmed'])[0]
        predictions[0,3] = np.array(demo_data['newcountdeaths'])[0]
        predictions[0,4] = np.array(demo_data['hospitalized_covid_confirmed_patients'])[0]
        predictions[0,5] = np.array(demo_data['hospitalized_suspected_covid_patients'])[0]
        predictions[0,6] = np.array(demo_data['icu_suspected_covid_patients'])[0]

        demo_data = pd.DataFrame(np.concatenate((demo_data, predictions), axis = 0))
        demo_data.columns = ['newcountconfirmed', 'newcountdeaths', 'hospitalized_covid_confirmed_patients', 'hospitalized_suspected_covid_patients','hospitalized_covid_patients', 'icu_covid_confirmed_patients', 'icu_suspected_covid_patients']

    transformed_demo_data = np.intc(scaler.inverse_transform(demo_data))
    demo_data['newcountconfirmed'] = np.ceil(np.abs(transformed_demo_data[:, 0]))
    demo_data['newcountdeaths'] = np.ceil(np.abs(transformed_demo_data[:, 1]))
    demo_data['hospitalized_covid_confirmed_patients'] = np.ceil(np.abs(transformed_demo_data[:, 2]))
    demo_data['hospitalized_suspected_covid_patients'] = transformed_demo_data[:, 3]
    demo_data['hospitalized_covid_patients'] = np.ceil(np.abs(transformed_demo_data[:, 4]))
    demo_data['icu_covid_confirmed_patients'] = np.ceil(np.abs(transformed_demo_data[:, 5]))
    demo_data['icu_suspected_covid_patients'] = transformed_demo_data[:, 6]
    
    demo_data['county'] =  county
    demo_data = demo_data[['county','hospitalized_covid_confirmed_patients', 'hospitalized_suspected_covid_patients', 'hospitalized_covid_patients','icu_covid_confirmed_patients', 'icu_suspected_covid_patients']]
    date_data = data[['date']]
    date_data = date_data.tail(time_steps + 4)
    date_data['date'] = pd.to_datetime(date_data['date']).apply(pd.DateOffset(3))
    
  
    forecasted_data = pd.concat([forecasted_data, pd.DataFrame(demo_data.iloc[[time_steps + 1]])])

In [18]:
# Displaying the forecasted data after 3 days from the last day in the dataset
forecasted_data.to_csv("forecast.csv")
forecasted_data

Unnamed: 0,county,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,icu_covid_confirmed_patients,icu_suspected_covid_patients
31,Santa Clara,68.0,6,140.0,18.0,3
31,San Mateo,42.0,6,66.0,10.0,1
31,Santa Barbara,109.0,-6,98.0,12.0,1
31,Tuolumne,8.0,6,2.0,3.0,0
31,Placer,22.0,6,36.0,28.0,1
31,San Luis Obispo,32.0,18,18.0,5.0,0
31,Solano,45.0,6,48.0,9.0,4
31,Monterey,20.0,6,56.0,3.0,0
31,Yuba,12.0,6,26.0,2.0,0
31,San Francisco,48.0,6,95.0,16.0,3


In [19]:
date=max(date_data['date'])

In [20]:
!pip install plotly-geo
!pip install geopandas==0.3.0
!pip install pyshp==1.2.10
!pip install shapely==1.6.3

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m


In [21]:
import plotly.figure_factory as ff

In [22]:
fips_data = pd.read_csv('county.csv')
available_hospital_data = pd.read_csv('available_hospitals_by_county.csv')

In [23]:
df = pd.merge(forecasted_data, fips_data, how = 'inner', on = ['county'])
df = pd.merge(df, available_hospital_data, how = 'inner', on = ['county'])

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 0 to 55
Data columns (total 10 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   county                                 56 non-null     object 
 1   hospitalized_covid_confirmed_patients  56 non-null     float64
 2   hospitalized_suspected_covid_patients  56 non-null     int32  
 3   hospitalized_covid_patients            56 non-null     float64
 4   icu_covid_confirmed_patients           56 non-null     float64
 5   icu_suspected_covid_patients           56 non-null     int32  
 6   fips                                   56 non-null     int64  
 7   todays_date                            56 non-null     object 
 8   all_hospital_beds                      56 non-null     int64  
 9   icu_available_beds                     56 non-null     int64  
dtypes: float64(3), int32(2), int64(3), object(2)
memory usage: 4.4+ KB


In [30]:
def visualize(values, fips, date, title):
    colorscale =colorscale = [
        'rgb(193, 193, 193)',
        'rgb(239,239,239)',
        'rgb(195, 196, 222)',
        'rgb(144,148,194)',
        'rgb(101,104,168)',
        'rgb(65, 53, 132)'
    ]


    endpts = list(np.linspace(-50, 50, len(colorscale) - 1))

    fig = ff.create_choropleth(
        fips=fips, values=values, colorscale=colorscale, show_state_data=True, 
        scope=["CA"], 
        binning_endpoints=endpts, 
        county_outline={'color': 'rgb(15, 15, 55)', 'width': 0.5}, 
        state_outline={'color': 'rgb(15, 15, 55)', 'width': 1}, 
        show_hover=True,
        plot_bgcolor='rgb(229,229,229)', 
        paper_bgcolor='rgb(229,229,229)',
        legend_title='No of beds available', 
        title=title+' in CA as of ' + date.strftime("%b %d %Y")
    )

    fig_counties_info = [t for t in fig['data'] if t.text][0]
    x_pts = list(fig_counties_info['x'])
    y_pts = list(fig_counties_info['y'])

    new_x_pts = []
    for coordinate in x_pts:
        if isinstance(coordinate,list):
            new_coordinate = float(coordinate[0])
        else:
            new_coordinate = coordinate
            new_x_pts.append(new_coordinate)

    new_y_pts = []
    for coordinate in y_pts:
        if isinstance(coordinate,list):
            new_coordinate = float(coordinate[0])
        else:
            new_coordinate = coordinate
            new_y_pts.append(new_coordinate)

    for t in fig['data']:
        if t.text:
            t['x'] = new_x_pts
            t['y'] = new_y_pts

    #pyo.plot(fig)

    fig.show()

In [31]:
values = (df['all_hospital_beds']-df['hospitalized_covid_patients']).tolist()
fips = df['fips'].tolist()

visualize(values, fips, date, 'Available Hospital Beds')

In [32]:
values = (df['icu_available_beds']-(df['icu_covid_confirmed_patients']+ df['icu_suspected_covid_patients'])).tolist()
fips = df['fips'].tolist()

visualize(values, fips, date, 'Available ICU beds')