In [None]:
import fbprophet as pr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
def get_covid_data():
    """
    Download the latest OxCGRT policy-tracker CSV and add daily-increase columns.

    ``NewCases`` and ``NewDeaths`` are the row-to-row difference of the cumulative
    ``ConfirmedCases`` and ``ConfirmedDeaths`` columns, computed separately for

    * every state series (``Jurisdiction == 'STATE_TOTAL'``), identified by
      ``CountryName`` *and* ``RegionName`` so that identically named regions of
      different countries are never differenced against each other, and
    * every national series (``Jurisdiction == 'NAT_TOTAL'``), identified by
      ``CountryName``.

    The first row of a series, and any row whose difference is undefined because
    a cumulative value is missing, gets 0.  Rows of any other jurisdiction, or
    rows whose identifying name is missing, keep NaN in both new columns.

    Rows are differenced in file order, so the CSV is assumed to be sorted by
    date within each series.

    Returns:
        pandas.DataFrame: the downloaded data plus ``NewCases`` and ``NewDeaths``.
    """
    #get the latest data from OxCGRT
    DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
    read_options = dict(parse_dates=['Date'],
                        encoding="ISO-8859-1",
                        dtype={"RegionName": str})
    try:
        # ``error_bad_lines`` was removed in pandas 2.0; this is its replacement.
        full_df = pd.read_csv(DATA_URL, on_bad_lines='skip', **read_options)
    except TypeError:
        # Older pandas releases do not accept ``on_bad_lines`` yet.
        full_df = pd.read_csv(DATA_URL, error_bad_lines=False, **read_options)

    #add new cases and new deaths columns
    full_df['NewCases'] = float('nan')
    full_df['NewDeaths'] = float('nan')

    # (jurisdiction, columns that identify one time series within it)
    series_definitions = (('STATE_TOTAL', ['CountryName', 'RegionName']),
                          ('NAT_TOTAL', ['CountryName']))
    for jurisdiction, keys in series_definitions:
        rows = (full_df['Jurisdiction'] == jurisdiction) & full_df[keys].notna().all(axis=1)
        if not rows.any():
            continue
        # One grouped pass instead of re-scanning the whole frame for every region.
        grouped = full_df.loc[rows].groupby(keys, sort=False)
        full_df.loc[rows, 'NewCases'] = grouped['ConfirmedCases'].diff().fillna(0)
        full_df.loc[rows, 'NewDeaths'] = grouped['ConfirmedDeaths'].diff().fillna(0)

    return full_df

def get_region(df, division, region, max_missing=21):
    """
    Extract and clean the time series of a single state or country.

    Args:
        df: frame with at least the columns ``Jurisdiction``, ``RegionName``,
            ``CountryName`` and ``Date`` (for example the result of
            ``get_covid_data()``).  It is not modified.
        division: ``'state'`` to match ``region`` against ``RegionName`` among
            ``STATE_TOTAL`` rows, or ``'country'`` to match it against
            ``CountryName`` among ``NAT_TOTAL`` rows.
        region: name of the state or country to select.
        max_missing: columns with more than this many missing values in the
            selected rows are dropped (default 21).

    Returns:
        pandas.DataFrame: a new frame holding the selected rows except the last
        one, with sparse columns removed, ``ConfirmedCases`` / ``ConfirmedDeaths``
        set to 0 on 2020-01-01 and remaining gaps forward-filled.

    Raises:
        ValueError: if ``division`` is neither ``'state'`` nor ``'country'``.
    """
    if division == 'state':
        selected = (df['Jurisdiction'] == 'STATE_TOTAL') & (df['RegionName'] == region)
    elif division == 'country':
        selected = (df['Jurisdiction'] == 'NAT_TOTAL') & (df['CountryName'] == region)
    else:
        # Previously this only printed and then processed the unfiltered frame.
        raise ValueError('Please specify "state" or "country" division')

    # The final row is discarded -- presumably because the most recent day is
    # still incomplete in the source data (TODO confirm).
    region_df = df[selected][:-1]

    # Keep only columns that are reasonably complete for this region; copy so
    # the assignments below never write into the caller's frame.
    missing = region_df.isna().sum()
    region_df = region_df[missing[missing <= max_missing].index].copy()

    # Anchor the cumulative counts at 0 on the first day so the forward fill
    # has a starting value; skip columns removed by the sparsity filter.
    cumulative = [col for col in ('ConfirmedCases', 'ConfirmedDeaths') if col in region_df.columns]
    if cumulative:
        region_df.loc[region_df['Date'] == '2020-01-01', cumulative] = 0

    return region_df.ffill()

def mean_percent_error(y_test, y_hat):
    """
    Mean absolute error of ``y_hat`` relative to ``y_test``.

    Computes ``sum(|y_test - y_hat| / y_test) / len(y_test)``.  The result is a
    fraction (0.1 means 10 %); it is not multiplied by 100.

    Args:
        y_test: observed values (list, tuple, numpy array or pandas Series).
        y_hat: predicted values, same length as ``y_test``.

    Returns:
        float: the mean relative error.

    Note:
        Zeros in ``y_test`` are not guarded against; they make the corresponding
        term a division by zero.
    """
    # Plain Python sequences have no element-wise arithmetic, so lift them to
    # arrays; numpy arrays and pandas Series are used as they are.
    if isinstance(y_test, (list, tuple)):
        y_test = np.asarray(y_test, dtype=float)
    if isinstance(y_hat, (list, tuple)):
        y_hat = np.asarray(y_hat, dtype=float)

    relative_error = np.abs(y_test - y_hat) / y_test
    return relative_error.sum() / len(y_test)

In [None]:
def get_simple_covid_data():
    """
    Download latest confirmed cases and deaths from Oxford by both states and countries.

    The cumulative ``ConfirmedCases`` / ``ConfirmedDeaths`` columns are cleaned
    first: a missing value on 2020-01-01 becomes 0 and remaining gaps are
    forward-filled *within each series* (same ``Jurisdiction``, ``CountryName``
    and ``RegionName``), so one region's totals can never be carried into the
    next region's rows.

    ``NewCases`` and ``NewDeaths`` are then the running difference of the
    cumulative columns, per state series (``STATE_TOTAL``, identified by country
    and region) and per national series (``NAT_TOTAL``, identified by country).
    The first row of a series, and rows whose difference is undefined, get 0;
    rows of any other jurisdiction keep NaN.

    Rows are processed in file order, so the CSV is assumed to be sorted by
    date within each series.

    Returns:
        pandas.DataFrame: columns ``Date``, ``Jurisdiction``, ``RegionName``,
        ``CountryName``, ``ConfirmedCases``, ``ConfirmedDeaths``, ``NewCases``
        and ``NewDeaths``.
    """
    #download latest data from Oxford
    DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
    read_options = dict(parse_dates=['Date'],
                        encoding="ISO-8859-1",
                        dtype={"RegionName": str},
                        usecols=['Date', 'Jurisdiction', 'RegionName', 'CountryName',
                                 'ConfirmedCases', 'ConfirmedDeaths'])
    try:
        # ``error_bad_lines`` was removed in pandas 2.0; this is its replacement.
        df = pd.read_csv(DATA_URL, on_bad_lines='skip', **read_options)
    except TypeError:
        # Older pandas releases do not accept ``on_bad_lines`` yet.
        df = pd.read_csv(DATA_URL, error_bad_lines=False, **read_options)

    cumulative = ['ConfirmedCases', 'ConfirmedDeaths']

    #if January 1st is NaN, set to 0
    first_day = df['Date'] == '2020-01-01'
    for column in cumulative:
        df.loc[first_day & df[column].isna(), column] = 0

    #forward fill NaNs in confirmed cases and confirmed deaths columns, one series at a time
    # (missing names are replaced by '' so that no row is dropped from the grouping)
    series_id = [df[key].fillna('') for key in ('Jurisdiction', 'CountryName', 'RegionName')]
    df[cumulative] = df.groupby(series_id, sort=False)[cumulative].ffill()

    #add new cases and new deaths columns
    df['NewCases'] = float('nan')
    df['NewDeaths'] = float('nan')

    # (jurisdiction, columns that identify one time series within it)
    series_definitions = (('STATE_TOTAL', ['CountryName', 'RegionName']),
                          ('NAT_TOTAL', ['CountryName']))
    for jurisdiction, keys in series_definitions:
        rows = (df['Jurisdiction'] == jurisdiction) & df[keys].notna().all(axis=1)
        if not rows.any():
            continue
        # One grouped pass instead of re-scanning the whole frame for every region.
        grouped = df.loc[rows].groupby(keys, sort=False)
        df.loc[rows, 'NewCases'] = grouped['ConfirmedCases'].diff().fillna(0)
        df.loc[rows, 'NewDeaths'] = grouped['ConfirmedDeaths'].diff().fillna(0)

    return df

In [None]:
# Download the data once and preview only the first rows rather than
# rendering the whole frame as cell output.
df = get_simple_covid_data()
df.head()

In [None]:
import pandas as pd
from fbprophet import Prophet
import os

def get_simple_covid_data():
    """
    Download latest confirmed cases and deaths from Oxford by both states and countries.

    The cumulative ``ConfirmedCases`` / ``ConfirmedDeaths`` columns are cleaned
    first: a missing value on 2020-01-01 becomes 0 and remaining gaps are
    forward-filled *within each series* (same ``Jurisdiction``, ``CountryName``
    and ``RegionName``), so one region's totals can never be carried into the
    next region's rows.

    ``NewCases`` and ``NewDeaths`` are then the running difference of the
    cumulative columns, per state series (``STATE_TOTAL``, identified by country
    and region) and per national series (``NAT_TOTAL``, identified by country).
    The first row of a series, and rows whose difference is undefined, get 0;
    rows of any other jurisdiction keep NaN.

    Rows are processed in file order, so the CSV is assumed to be sorted by
    date within each series.

    Returns:
        pandas.DataFrame: columns ``Date``, ``Jurisdiction``, ``RegionName``,
        ``CountryName``, ``ConfirmedCases``, ``ConfirmedDeaths``, ``NewCases``
        and ``NewDeaths``.
    """
    # NOTE(review): an earlier cell of this notebook already defines a function
    # with this name; this definition replaces it once the cell runs.  Keep a
    # single definition when the notebook is tidied up.

    #download latest data from Oxford
    DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
    read_options = dict(parse_dates=['Date'],
                        encoding="ISO-8859-1",
                        dtype={"RegionName": str},
                        usecols=['Date', 'Jurisdiction', 'RegionName', 'CountryName',
                                 'ConfirmedCases', 'ConfirmedDeaths'])
    try:
        # ``error_bad_lines`` was removed in pandas 2.0; this is its replacement.
        df = pd.read_csv(DATA_URL, on_bad_lines='skip', **read_options)
    except TypeError:
        # Older pandas releases do not accept ``on_bad_lines`` yet.
        df = pd.read_csv(DATA_URL, error_bad_lines=False, **read_options)

    cumulative = ['ConfirmedCases', 'ConfirmedDeaths']

    #if January 1st is NaN, set to 0
    first_day = df['Date'] == '2020-01-01'
    for column in cumulative:
        df.loc[first_day & df[column].isna(), column] = 0

    #forward fill NaNs in confirmed cases and confirmed deaths columns, one series at a time
    # (missing names are replaced by '' so that no row is dropped from the grouping)
    series_id = [df[key].fillna('') for key in ('Jurisdiction', 'CountryName', 'RegionName')]
    df[cumulative] = df.groupby(series_id, sort=False)[cumulative].ffill()

    #add new cases and new deaths columns
    df['NewCases'] = float('nan')
    df['NewDeaths'] = float('nan')

    # (jurisdiction, columns that identify one time series within it)
    series_definitions = (('STATE_TOTAL', ['CountryName', 'RegionName']),
                          ('NAT_TOTAL', ['CountryName']))
    for jurisdiction, keys in series_definitions:
        rows = (df['Jurisdiction'] == jurisdiction) & df[keys].notna().all(axis=1)
        if not rows.any():
            continue
        # One grouped pass instead of re-scanning the whole frame for every region.
        grouped = df.loc[rows].groupby(keys, sort=False)
        df.loc[rows, 'NewCases'] = grouped['ConfirmedCases'].diff().fillna(0)
        df.loc[rows, 'NewDeaths'] = grouped['ConfirmedDeaths'].diff().fillna(0)

    return df

def predict(country='United States', state = None, days_ahead=30, predict='cases', output_folder = None,
            rolling_mean = True, holiday_country = None):
    """
    Forecast daily new COVID-19 cases or deaths for one country or state with Prophet.

    Args:
        country: ``CountryName`` to forecast.
        state: optional ``RegionName``.  When given, the ``STATE_TOTAL`` series of
            that state within ``country`` is used; otherwise the ``NAT_TOTAL``
            series of ``country``.
        days_ahead: number of future days to forecast.
        predict: ``'deaths'`` forecasts ``NewDeaths``; any other value forecasts
            ``NewCases``.  The value is also used (title-cased) as the y-axis label.
        output_folder: directory that receives ``prediction.json`` and
            ``prediction_graph.png``.  It is created if missing.  When ``None``
            (the default) nothing is written to disk.
        rolling_mean: when True, ``yhat`` is replaced by its trailing 7-day mean.
        holiday_country: country code handed to ``Prophet.add_country_holidays``.
            ``None`` (the default) uses ``'US'`` when ``country`` is
            ``'United States'`` and adds no holidays for any other country.

    Returns:
        pandas.DataFrame: columns ``ds`` and ``yhat``, one row per forecast day.

    Raises:
        ValueError: if no rows match the requested country/state.
    """
    ROLLING_WINDOW = 7

    #retrieve latest covid data
    data = get_simple_covid_data()

    #subset by country and state.  Defaults if no regional info is passed is all of United States
    if state:
        selected = ((data['CountryName'] == country)
                    & (data['Jurisdiction'] == 'STATE_TOTAL')
                    & (data['RegionName'] == state))
    else:
        selected = (data['Jurisdiction'] == 'NAT_TOTAL') & (data['CountryName'] == country)

    target = 'NewDeaths' if predict == 'deaths' else 'NewCases'
    history = data.loc[selected, ['Date', target]].rename(columns = {'Date':'ds', target:'y'})
    if history.empty:
        raise ValueError(f'No data found for country={country!r}, state={state!r}')

    #create forecast using Facebook Prophet
    m = Prophet(seasonality_mode = 'multiplicative',
                yearly_seasonality = False,
                daily_seasonality = False,
                weekly_seasonality = True)
    # US holidays only make sense for the US series; other countries need an
    # explicit holiday_country.
    if holiday_country is None and country == 'United States':
        holiday_country = 'US'
    if holiday_country:
        m.add_country_holidays(country_name=holiday_country)
    m.fit(history)
    future = m.make_future_dataframe(periods=days_ahead)
    forecast = m.predict(future)[['ds','yhat']]

    # Smooth over the whole fitted + future range *before* cutting out the
    # forecast window; smoothing afterwards would leave the first
    # ROLLING_WINDOW - 1 forecast days as NaN.
    if rolling_mean:
        forecast = forecast.assign(yhat=forecast['yhat'].rolling(window=ROLLING_WINDOW).mean())
    forecast = forecast.tail(days_ahead)

    #create graph of forecasted cases
    title = f'Predicted New {predict.title()}: Next {days_ahead} Days'
    if rolling_mean:
        title += f' ({ROLLING_WINDOW}-Day Rolling Average)'
    ax = forecast.plot(x='ds', y='yhat', ylim = (0,forecast['yhat'].max()*1.1),
                       title = title,
                       xlabel = 'Date', ylabel = predict.title())

    #save forecast as JSON and the graph as PNG
    if output_folder is not None:
        # Create the folder itself (not merely its parent directory).
        os.makedirs(output_folder, exist_ok=True)
        output_file_path = os.path.join(output_folder,'prediction.json')
        forecast.to_json(output_file_path, orient='table', index=False)
        output_image_path = os.path.join(output_folder,'prediction_graph.png')
        # Save through the Axes' own figure so no pyplot import is required here.
        ax.get_figure().savefig(output_image_path)

    return forecast

In [None]:
# Run the forecast with predict()'s defaults and write its output files to
# '/content' (presumably the Google Colab working directory -- adjust when
# running elsewhere).
forecast = predict(output_folder='/content')
forecast