In [4]:
from pmdarima import auto_arima
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from statsmodels.tsa.arima.model import ARIMA
from datetime import timedelta


# Daily new-case counts from the OWID mirror of the JHU data; the code below
# reads a "date" column plus one column per country from it.
new_cases = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/jhu/new_cases.csv"

cases_df = pd.read_csv(new_cases)


def _country_cases(country):
    """Build a date-indexed frame of `country`'s new cases with missing days removed.

    The final row of the CSV is left out, and the single column is named
    "new_cases" so every country frame has the same layout.
    """
    frame = pd.DataFrame()
    frame["new_cases"] = cases_df[country][:-1]
    frame.index = pd.to_datetime(cases_df["date"][:-1])
    return frame.dropna()


india_df = _country_cases("India")
usa_df = _country_cases("United States")

print(usa_df.tail())

            new_cases
date                 
2021-10-25   104829.0
2021-10-26    79971.0
2021-10-27    99694.0
2021-10-28    78128.0
2021-10-29    97361.0


In [5]:
def arima_predict(dfs, days_ahead, window=125):
    """
    Forecast the next `days_ahead` daily values of "new_cases" for each frame.

    For every dataframe, only the most recent `window` rows are used. The
    ARIMA order (p, d, q) is chosen with pmdarima's stepwise `auto_arima`
    search, a statsmodels ARIMA model with that order is fitted on the same
    rows, and the out-of-sample predictions are re-indexed with the calendar
    days that follow that frame's own last date.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames with a datetime index and a "new_cases" column.
    days_ahead : int
        Number of days to forecast; must be at least 1. Intended for short
        horizons (about a week).
    window : int, optional
        Number of trailing rows used for fitting (default 125).

    Returns
    -------
    list
        One single-element list per input frame, each holding a pandas Series
        of `days_ahead` forecasts, i.e. `result[i][0]` is the forecast for
        `dfs[i]`. An empty `dfs` yields an empty list.

    Raises
    ------
    ValueError
        If `days_ahead` or `window` is smaller than 1.
    """
    if days_ahead < 1:
        raise ValueError("days_ahead must be a positive integer")
    if window < 1:
        raise ValueError("window must be a positive integer")

    predictions = []

    for df in dfs:
        # Only the recent history is modelled.
        recent = df.iloc[-window:]

        # Anchor the forecast dates to THIS frame's last observation; frames
        # may end on different days once missing values have been dropped.
        last_date = recent.index[-1]

        # Silence fitting noise only while fitting, instead of switching
        # warnings off for the rest of the session.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            stepwise_fit = auto_arima(recent["new_cases"], stepwise=True, trace=True, suppress_warnings=True)

            # Read the selected order from the fitted estimator rather than
            # parsing it out of the printed summary text.
            p, d, q = stepwise_fit.order

            model_fit = ARIMA(recent["new_cases"], order=(p, d, q)).fit()

            # Positions len(recent) .. len(recent) + days_ahead - 1 are the
            # first `days_ahead` steps past the end of the sample.
            pred = model_fit.predict(start=len(recent), end=len(recent) + days_ahead - 1)

        pred.index = pd.date_range(start=last_date + timedelta(days=1), periods=days_ahead, freq="D")

        # Each forecast stays wrapped in a one-element list so existing
        # callers that index `result[i][0]` keep working.
        predictions.append([pred])
    return predictions

In [6]:
# Forecast the next 7 days of new cases for the India and United States frames.
final_predictions = arima_predict([india_df, usa_df], 7)

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=1.90 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=2633.276, Time=0.02 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=2604.080, Time=0.04 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=2574.745, Time=0.22 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=2631.366, Time=0.17 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=2576.735, Time=0.22 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=2576.775, Time=0.20 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.38 sec
 ARIMA(0,1,1)(0,0,0)[0]             : AIC=2574.560, Time=0.03 sec
 ARIMA(1,1,1)(0,0,0)[0]             : AIC=2576.554, Time=0.04 sec
 ARIMA(0,1,2)(0,0,0)[0]             : AIC=2576.558, Time=0.05 sec
 ARIMA(1,1,0)(0,0,0)[0]             : AIC=2602.134, Time=0.03 sec
 ARIMA(1,1,2)(0,0,0)[0]             : AIC=2578.634, Time=0.14 sec

Best model:  ARIMA(0,1,1)(0,0,0)[0]          
Total fit time: 3.946 seconds
Performing stepwise search to mi

In [7]:
# Dump the raw result: one single-element list per input frame, each holding a forecast Series.
print(final_predictions)

[[2021-10-30    14726.921547
2021-10-31    14726.921547
2021-11-01    14726.921547
2021-11-02    14726.921547
2021-11-03    14726.921547
2021-11-04    14726.921547
2021-11-05    14726.921547
Freq: D, Name: predicted_mean, dtype: float64], [2021-10-30    39797.915035
2021-10-31    59442.073438
2021-11-01    81822.578575
2021-11-02    61670.503876
2021-11-03    56318.349202
2021-11-04    83188.244517
2021-11-05    78781.265788
Freq: D, Name: predicted_mean, dtype: float64]]


In [11]:
# Forecast Series for the first input frame (India), unwrapped from its one-element list.
final_predictions[0][0]

2021-10-30    14726.921547
2021-10-31    14726.921547
2021-11-01    14726.921547
2021-11-02    14726.921547
2021-11-03    14726.921547
2021-11-04    14726.921547
2021-11-05    14726.921547
Freq: D, Name: predicted_mean, dtype: float64