In [125]:
from pmdarima import auto_arima
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from statsmodels.tsa.arima.model import ARIMA
from datetime import timedelta


# Daily new-case counts CSV from the owid/covid-19-data repository (jhu folder).
new_cases = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/jhu/new_cases.csv"

cases_df = pd.read_csv(new_cases)

# Leave out the final row of the file and parse the remaining dates once,
# so both country frames share the same trimmed rows and date labels.
trimmed_cases = cases_df.iloc[:-1]
case_dates = pd.to_datetime(trimmed_cases["date"])

# One single-column frame per country, indexed by date, with missing days removed.
india_df = trimmed_cases[["India"]].rename(columns={"India": "new_cases"})
india_df.index = case_dates
india_df = india_df.dropna()

usa_df = trimmed_cases[["United States"]].rename(columns={"United States": "new_cases"})
usa_df.index = case_dates
usa_df = usa_df.dropna()

print(usa_df.tail())

            new_cases
date                 
2021-10-25   104829.0
2021-10-26    79971.0
2021-10-27    99694.0
2021-10-28    78128.0
2021-10-29    97361.0


In [123]:
def arima_predict(dfs, days_ahead, window=150):
    """
    Forecast "new_cases" for each dataframe using an ARIMA model.

    For every dataframe the most recent ``window`` rows are kept, ``auto_arima``
    searches for the (p, d, q) order, a statsmodels ``ARIMA`` with that order is
    fitted on the same rows, and ``days_ahead`` future values are forecast.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Each frame needs a "new_cases" column and an index whose last label
        accepts ``+ timedelta`` (e.g. a DatetimeIndex).
    days_ahead : int
        Number of days to forecast; must be at least 1. Intended for short
        horizons (1 to 7 days).
    window : int, default 150
        Number of most recent rows used for fitting. Only recent cases are
        considered to avoid overfitting to older data.

    Returns
    -------
    list
        One entry per input dataframe, in the same order. Each entry is a
        one-element list holding the forecast Series, indexed by the
        ``days_ahead`` daily dates that follow that dataframe's last date.

    Raises
    ------
    ValueError
        If ``days_ahead`` or ``window`` is smaller than 1, or a dataframe
        has no rows.
    """
    if days_ahead < 1:
        raise ValueError("days_ahead must be at least 1")
    if window < 1:
        raise ValueError("window must be at least 1")

    predictions = []

    for df in dfs:
        if len(df) == 0:
            raise ValueError("cannot forecast from an empty dataframe")

        recent = df.iloc[-window:]

        # Each frame's forecast dates follow its own last observation; frames
        # may end on different dates once missing values have been dropped.
        last_date = recent.index[-1]

        # Silence model-fitting warnings only while fitting, instead of
        # changing the warning filters for the rest of the session.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            stepwise_fit = auto_arima(recent["new_cases"], stepwise=True, trace=True, suppress_warnings=True)

            # Read the selected order from the fitted estimator rather than
            # parsing it out of the printed summary text.
            p, d, q = stepwise_fit.order
            print(p, d, q)

            model_fit = ARIMA(recent["new_cases"], order=(p, d, q)).fit()

            # Out-of-sample forecast for the next `days_ahead` steps.
            pred = model_fit.forecast(steps=days_ahead)

        pred.index = pd.date_range(start=last_date + timedelta(days=1), periods=days_ahead, freq="D")

        # Kept as a one-element list so existing callers that index
        # predictions[i][0] continue to work.
        predictions.append([pred])

    return predictions

In [124]:
# Forecast the next 7 days of new cases for India and the United States.
# The result holds one entry per input frame, in the same order as passed in.
final_predictions = arima_predict([india_df, usa_df], 7)

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.56 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=3149.510, Time=0.01 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=3141.071, Time=0.05 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=3142.121, Time=0.06 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=3148.627, Time=0.01 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=3136.961, Time=0.07 sec
 ARIMA(3,1,0)(0,0,0)[0] intercept   : AIC=3138.160, Time=0.09 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=3140.499, Time=0.16 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=3126.443, Time=0.27 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=3139.447, Time=0.13 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=3138.087, Time=0.08 sec
 ARIMA(1,1,1)(0,0,0)[0]             : AIC=3128.137, Time=0.10 sec

Best model:  ARIMA(1,1,1)(0,0,0)[0] intercept
Total fit time: 1.617 seconds
1 1 1
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=36