## imports

In [3]:
import pandas
import numpy
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
import mysql.connector
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
import joblib
import datetime
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.vector_ar.var_model import VAR

%matplotlib qt

## Database Connection and data Fetching

In [3]:
with open('../config.env') as f:
    credentials=f.read()
credentials=credentials.split(" ")
con = mysql.connector.connect(username=credentials[0],password=credentials[1],host=credentials[2],port=credentials[3],database=credentials[4])
query=con.cursor()

In [4]:
sql="SELECT * FROM total_india_cases"
query.execute(sql)
result=query.fetchall()
column=[columns[0] for columns in query.description]
india_data=pandas.DataFrame(result,columns=column)

In [5]:
india_data

Unnamed: 0,index_no,date,ordinal_date,total_confirmed,total_active,total_recovered,total_deaths,total_tested,total_vaccinated1,total_vaccinated2,...,delta7_confirmed,delta7_active,delta7_recovered,delta7_deaths,delta7_tested,delta7_vaccinated1,delta7_vaccinated2,total_other,delta_other,delta7_other
0,1,2020-01-30,737454,1,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,2,2020-02-02,737457,2,2,0,0,0,0,0,...,2,2,0,0,0,0,0,0,0,0
2,3,2020-02-03,737458,3,3,0,0,0,0,0,...,3,3,0,0,0,0,0,0,0,0
3,4,2020-02-14,737469,3,0,3,0,0,0,0,...,0,-3,3,0,0,0,0,0,0,0
4,5,2020-03-02,737486,5,2,3,0,0,0,0,...,2,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608,610,2021-10-27,738090,34231243,154866,33606777,456418,604498405,723497151,317002722,...,104536,-17668,118586,3574,8756187,17828127,24133671,13182,10,44
609,611,2021-10-28,738091,34245550,155174,33619966,457223,605885769,726445742,321755224,...,103069,-14258,113133,4147,8819288,17058938,25237448,13187,5,47
610,612,2021-10-29,738092,34259765,155281,33633515,457774,607062619,728940103,325373874,...,100957,-12163,109046,4032,8631457,16526747,24758819,13195,8,42
611,613,2021-10-30,738093,34272705,153103,33648187,458219,608319915,731621098,329819237,...,97818,-13347,107209,3918,8548595,15887789,24509288,13196,1,38


In [6]:
sql="SELECT * FROM total_state_cases WHERE state_name='MH'"
query.execute(sql)
result=query.fetchall()
column=[columns[0] for columns in query.description]
maharashtra_data=pandas.DataFrame(result,columns=column)

In [7]:
maharashtra_data

Unnamed: 0,index_no,date,ordinal_date,state_name,total_confirmed,total_active,total_recovered,total_deaths,total_tested,total_vaccinated1,...,delta7_confirmed,delta7_active,delta7_recovered,delta7_deaths,delta7_tested,delta7_vaccinated1,delta7_vaccinated2,total_other,delta_other,delta7_other
0,52,2020-03-09,737493,MH,2,2,0,0,0,0,...,2,2,0,0,0,0,0,0,0,0
1,64,2020-03-10,737494,MH,5,5,0,0,0,0,...,5,5,0,0,0,0,0,0,0,0
2,76,2020-03-11,737495,MH,11,11,0,0,0,0,...,11,11,0,0,0,0,0,0,0,0
3,89,2020-03-12,737496,MH,14,14,0,0,0,0,...,14,14,0,0,0,0,0,0,0,0
4,102,2020-03-13,737497,MH,17,17,0,0,0,0,...,17,17,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,20925,2021-10-27,738090,MH,6606536,19480,6443342,140098,62202811,66397746,...,9891,-6248,15916,212,832421,2010496,1534138,3616,1,11
598,20961,2021-10-28,738091,MH,6607954,18748,6445454,140134,62316910,66658668,...,9736,-5544,15060,209,822820,1731722,1480512,3618,2,11
599,20997,2021-10-29,738092,MH,6609292,18465,6447038,140170,62439900,66908849,...,9442,-5673,14900,205,813601,1677329,1399884,3619,1,10
600,21033,2021-10-30,738093,MH,6609906,16905,6449186,140196,62559171,67145633,...,8355,-7117,15267,198,796208,1625770,1298054,3619,0,7


In [8]:
def get_district_data(districtname='Raigad'):
    sql="SELECT * FROM total_district_cases WHERE district_name='{}'".format(districtname)
    query.execute(sql)
    result=query.fetchall()
    column=[columns[0] for columns in query.description]
    district_data=pandas.DataFrame(result,columns=column)
    return district_data

raigad_data=get_district_data()
pune_data=get_district_data('Pune')
thane_data=get_district_data('Thane')
mumbai_data=get_district_data('Mumbai')

In [9]:
raigad_data

Unnamed: 0,index_no,date,ordinal_date,state_name,district_name,total_confirmed,total_active,total_recovered,total_deaths,delta_confirmed,...,delta7_deaths,total_vaccinated1,total_vaccinated2,delta_vaccinated1,delta_vaccinated2,delta7_vaccinated1,delta7_vaccinated2,total_other,delta_other,delta7_other
0,19918,2020-04-26,737541,MH,Raigad,57,36,20,1,0,...,0,0,0,0,0,0,0,0,0,0
1,20334,2020-04-27,737542,MH,Raigad,61,39,21,1,4,...,0,0,0,0,0,0,0,0,0,0
2,20759,2020-04-28,737543,MH,Raigad,66,44,21,1,5,...,0,0,0,0,0,0,0,0,0,0
3,21189,2020-04-29,737544,MH,Raigad,69,44,23,2,3,...,1,0,0,0,0,0,0,0,0,0
4,21623,2020-04-30,737545,MH,Raigad,71,44,24,3,2,...,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549,400570,2021-10-27,738090,MH,Raigad,195812,639,190620,4546,39,...,8,1850609,758887,3536,4449,67903,36282,7,0,0
550,401318,2021-10-28,738091,MH,Raigad,195860,659,190648,4546,48,...,5,1853351,763345,2742,4458,28042,35389,7,0,0
551,402066,2021-10-29,738092,MH,Raigad,195919,662,190703,4547,59,...,6,1856154,767986,2803,4641,24312,33312,7,0,0
552,402814,2021-10-30,738093,MH,Raigad,195932,649,190725,4551,13,...,10,1859181,773699,3027,5713,22326,32055,7,0,0


## function for traintest_splitting and next days ordinal_date creating

In [10]:
def splitting(data,training_ratio):
    print('Training data percentage = {}%'.format(training_ratio*100))
    length=len(data)
    traing_indexes=int(length*training_ratio)
    training_set=data[:traing_indexes]
    testing_set=data[traing_indexes:]
    print('rows in training data : {}     rows in testing data : {}'.format(len(training_set),len(testing_set)))
    return training_set,testing_set

In [117]:
number_of_next_days=7
def returnNextdays():
    date=738094
    date_range=[]
    l=[]
    for i in range(1,number_of_next_days+1):
        l.append([date+i])
        date_range.append(datetime.date.fromordinal(date+i))
    return l,date_range
predictingdate,daterange=returnNextdays()


In [118]:
datetime.date.fromordinal(738094)

datetime.date(2021, 10, 31)

## Train Test Splitting

In [None]:
# maindata=india_data


# maindata=maharashtra_data


maindata=raigad_data
# maindata=pune_data
# maindata=thane_data
# maindata=mumbai_data

In [None]:
try_data=numpy.concatenate((maindata['ordinal_date'].to_numpy(),predictingdate),axis=None)

In [None]:
maindata

In [None]:
type_of_case='delta_confirmed'
# type_of_case='delta_recovered'
# type_of_case='delta_deaths'
# type_of_case='delta_active'
# type_of_case='total_active'


name_of_case='daily confirmed'
# name_of_case='daily recovered'
# name_of_case='daily deaths'
# name_of_case='daily change in Active'
# name_of_case='Active '

maindata_labels=maindata[type_of_case]

In [None]:
train_data,test_data=splitting(maindata,0.95)

In [None]:
train_labels=train_data[type_of_case]
# train_labels=train_data['total_confirmed']

In [None]:
train_features=train_data[['ordinal_date']]
# train_fetures=train_data[['ordinal_date','delta_tested']]

In [None]:
train_features

In [None]:
train_labels

In [None]:
test_labels=test_data[type_of_case]
# test_labels=test_data['total_confirmed']

In [None]:
test_features=test_data[['ordinal_date']]
# test_features=test_data[['ordinal_date','delta_tested']]

In [None]:
# test_features

In [None]:
# test_labels

In [None]:
# print(predictingdate)
# print(daterange)

In [None]:
# maindata[['date','delta_deaths']][540:]

## Model Implementation 
1. Linear Regression
2. Random Forest Regression
3. Decision Tree Regression

In [None]:
linear_regression_model=LinearRegression()
random_forest_model=RandomForestRegressor()
decision_tree_model=DecisionTreeRegressor()

#### Linear regression

In [None]:
linear_regression_model.fit(train_features,train_labels)

In [None]:
linear_regression_model_train_predicted=linear_regression_model.predict(train_features)

In [None]:
# linear_regression_model_train_predicted

In [None]:
linear_regression_model_test_predicted=linear_regression_model.predict(test_features)

In [None]:
# linear_regression_model_test_predicted

In [None]:
linear_regression_model_next_days_predicted=linear_regression_model.predict(predictingdate)

In [None]:
# linear_regression_model_next_days_predicted

In [None]:
pyplot.figure('linear regression')
line_of_regression=numpy.concatenate((linear_regression_model_train_predicted,linear_regression_model_test_predicted))
pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
pyplot.plot(maindata['date'],line_of_regression,label='Line of Regression')
pyplot.plot(train_data['date'],linear_regression_model_train_predicted,label='Train Data')
pyplot.plot(test_data['date'],linear_regression_model_test_predicted,label='Test Data')
pyplot.plot(daterange,linear_regression_model_next_days_predicted,label='Future Prediction')
pyplot.xlabel('Date')
pyplot.ylabel('{}'.format(name_of_case))
pyplot.legend()
pyplot.show()

#### Ramdom Forest Regression regression

In [141]:
random_forest_model.fit(train_features,train_labels)

RandomForestRegressor()

In [142]:
random_forest_model_train_predicted=random_forest_model.predict(train_features)

In [143]:
# random_forest_model_train_predicted

In [144]:
random_forest_model_test_predicted=random_forest_model.predict(test_features)

In [145]:
# random_forest_model_test_predicted

In [146]:
random_forest_model_next_days_predicted=random_forest_model.predict(predictingdate)

In [147]:
# random_forest_model_next_days_predicted

In [44]:
pyplot.figure('random forest regression')
pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
pyplot.plot(train_data['date'],random_forest_model_train_predicted,label='Train Data')
pyplot.plot(test_data['date'],random_forest_model_test_predicted,label='Test Data')
pyplot.plot(daterange,random_forest_model_next_days_predicted,label='Future Prediction')
pyplot.xlabel('Date')
pyplot.ylabel('{}'.format(name_of_case))
pyplot.legend()
pyplot.show()

#### Decision Tree regression

In [148]:
decision_tree_model.fit(train_features,train_labels)

DecisionTreeRegressor()

In [149]:
decision_tree_model_train_predicted=decision_tree_model.predict(train_features)

In [150]:
# decision_tree_model_train_predicted

In [151]:
decision_tree_model_test_predicted=decision_tree_model.predict(test_features)

In [152]:
# decision_tree_model_test_predicted

In [153]:
decision_tree_model_next_days_predicted=decision_tree_model.predict(predictingdate)

In [154]:
# decision_tree_model_next_days_predicted

In [155]:
pyplot.figure('Decision tree regression')
pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
pyplot.plot(train_data['date'],decision_tree_model_train_predicted,label='Train Data')
pyplot.plot(test_data['date'],decision_tree_model_test_predicted,label='Test Data')
pyplot.plot(daterange,decision_tree_model_next_days_predicted,label='Future Prediction')
pyplot.xlabel('Date')
pyplot.ylabel('{}'.format(name_of_case))
pyplot.legend()
pyplot.show()

## Evaluation
    model
    train : mean_absolute_error     root_mean_squared_error
    test :  mean_absolute_error     root_mean_squared_error

In [156]:
print('linear regrassion')
print(mean_absolute_error(linear_regression_model_train_predicted,train_labels),numpy.sqrt(mean_squared_error(linear_regression_model_train_predicted,train_labels)))
print(mean_absolute_error(linear_regression_model_test_predicted,test_labels),numpy.sqrt(mean_squared_error(linear_regression_model_test_predicted,test_labels)))

linear regrassion
283.7287960292181 368.0271266950748
439.50933464417176 444.49449843172175


In [157]:
print('random forest regrassion')
print(mean_absolute_error(random_forest_model_train_predicted,train_labels),numpy.sqrt(mean_squared_error(random_forest_model_train_predicted,train_labels)))
print(mean_absolute_error(random_forest_model_test_predicted,test_labels),numpy.sqrt(mean_squared_error(random_forest_model_test_predicted,test_labels)))

random forest regrassion
25.4032319391635 46.39651595514114
70.52428571428571 119.64652296302997


In [158]:
print('Decision Tree regression')
print(mean_absolute_error(decision_tree_model_train_predicted,train_labels),numpy.sqrt(mean_squared_error(decision_tree_model_train_predicted,train_labels)))
print(mean_absolute_error(decision_tree_model_test_predicted,test_labels),numpy.sqrt(mean_squared_error(decision_tree_model_test_predicted,test_labels)))

Decision Tree regression
0.0 0.0
67.92857142857143 118.74071392263541


## Polynomial Regression

In [159]:
# pyplot.scatter(train_features,train_labels)

In [160]:
def polynomial_regrassion_model(degree_of_equation=1):
    polynomial=PolynomialFeatures(degree=degree_of_equation)
    polynomial_train_data=polynomial.fit_transform(train_features.to_numpy())
    polynomial_test_data=polynomial.fit_transform(test_features.to_numpy())
    polynomial_future_test_data=polynomial.fit_transform(predictingdate)
    polynomial_regression=LinearRegression()
    polynomial_regression.fit(polynomial_train_data,train_labels)
    polynomial_regression_model_train_predicted=polynomial_regression.predict(polynomial_train_data)
    polynomial_regression_model_test_predicted=polynomial_regression.predict(polynomial_test_data)
    polynomial_regression_model_next_days_predicted=polynomial_regression.predict(polynomial_future_test_data)
#     print('Train Data Prediction')
#     print(polynomial_regression_model_train_predicted)
#     print('\n\n\nTest Data Prediction')
#     print(polynomial_regression_model_test_predicted)
#     print('\n\n\nFuture Data Prediction')
#     print(polynomial_regression_model_next_days_predicted)
#     print('\n\n\n')
    print('{} degree Polynomial regression'.format(degree_of_equation))
    print(mean_absolute_error(polynomial_regression_model_train_predicted,train_labels),numpy.sqrt(mean_squared_error(polynomial_regression_model_train_predicted,train_labels)))
    print(mean_absolute_error(polynomial_regression_model_test_predicted,test_labels),numpy.sqrt(mean_squared_error(polynomial_regression_model_test_predicted,test_labels)))
    pyplot.figure('{} degree Polynomial regression'.format(degree_of_equation))
    pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
    pyplot.plot(train_data['date'],polynomial_regression_model_train_predicted,label='Train Data')
    pyplot.plot(test_data['date'],polynomial_regression_model_test_predicted,label='Test Data')
    pyplot.plot(daterange,polynomial_regression_model_next_days_predicted,label='Future Prediction')
    pyplot.xlabel('Date')
    pyplot.ylabel('{}'.format(name_of_case))
    pyplot.legend()
    pyplot.show()

In [161]:
# polynomial_regrassion_model()

In [162]:
polynomial_regrassion_model(2)

2 degree Polynomial regression
273.43034125552884 359.26357130628537
244.29481748172216 246.14881037222935


In [61]:
polynomial_regrassion_model(3)

3 degree Polynomial regression
273.4283158375736 359.2611594529157
244.23563034193856 246.0910541563638


In [62]:
polynomial_regrassion_model(4)

4 degree Polynomial regression
273.42629178633257 359.2587466446998
244.1765894698245 246.03344171185472


In [63]:
polynomial_regrassion_model(5)

5 degree Polynomial regression
273.42426577170767 359.25633287964564
244.11752731991666 245.97581110958728


In [64]:
polynomial_regrassion_model(6)

6 degree Polynomial regression
273.422239291124 359.2539181638695
244.0584507680365 245.9181689964809


# Forcasting Models

In [163]:
# order-1 (prefered for india level)
# order_of_forecasting_models1=(5,0,2)
# order_of_forecasting_models2=(4,0,3)

# order-2 (prefered for state and district level)
order_of_forecasting_models1=(5,0,3)
order_of_forecasting_models2=(3,0,3)

In [164]:
forcasting_data_features=maindata[['date',type_of_case]]

forcasting_train_data_features=train_data[['date',type_of_case]]

forcasting_test_data_features=test_data[['date',type_of_case]]

In [165]:
forcasting_data_features['date']=pandas.to_datetime(forcasting_data_features['date'])

forcasting_train_data_features['date']=pandas.to_datetime(forcasting_train_data_features['date'])

forcasting_test_data_features['date']=pandas.to_datetime(forcasting_test_data_features['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forcasting_data_features['date']=pandas.to_datetime(forcasting_data_features['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forcasting_train_data_features['date']=pandas.to_datetime(forcasting_train_data_features['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forcasting_test_data_

In [166]:
forcasting_data_features=forcasting_data_features.set_index('date')

forcasting_train_data_features=forcasting_train_data_features.set_index('date')

forcasting_test_data_features=forcasting_test_data_features.set_index('date')

In [167]:
# forcasting_data_features

In [168]:
# forcasting_train_data_features

In [169]:
# forcasting_test_data_features

## ARIMA model

In [170]:
main_arima_model = ARIMA(forcasting_data_features,order=order_of_forecasting_models1)
train_arima_model = ARIMA(forcasting_train_data_features,order=order_of_forecasting_models1)



In [171]:
main_arima_fit_model=main_arima_model.fit()
train_arima_fit_model=train_arima_model.fit()

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


In [172]:
main_arima_fit_model_predicted=main_arima_fit_model.predict()
train_arima_fit_model_predicted=train_arima_fit_model.predict()
test_arima_predicted=train_arima_fit_model.forecast(steps=len(forcasting_test_data_features))

In [173]:
arima_model_forcast=main_arima_fit_model.forecast(steps=number_of_next_days)

In [174]:
# main_arima_fit_model_predicted
# train_arima_fit_model_predicted
# test_arima_predicted
arima_model_forcast

2021-11-01    27.857880
2021-11-02    55.117316
2021-11-03    35.891366
2021-11-04    47.003866
2021-11-05    53.861303
2021-11-06    43.184289
2021-11-07    63.668474
Freq: D, Name: predicted_mean, dtype: float64

In [76]:
pyplot.figure('ARIMA model Forcasting')
pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
pyplot.plot(train_data['date'],train_arima_fit_model_predicted,label='Train Data')
pyplot.plot(test_data['date'],test_arima_predicted,label='Test Data')
pyplot.plot(daterange,arima_model_forcast,label='ARIMA Future Prediction')
pyplot.xlabel('Date')
pyplot.ylabel('{}'.format(name_of_case))
pyplot.legend()
pyplot.show()

In [175]:
print(mean_absolute_error(train_arima_fit_model_predicted,train_labels),numpy.sqrt(mean_squared_error(train_arima_fit_model_predicted,train_labels)))
print(mean_absolute_error(test_arima_predicted,test_labels),numpy.sqrt(mean_squared_error(test_arima_predicted,test_labels)))
    

74.74710378044784 128.9800894057669
104.23495538079318 143.42273131150176


In [176]:
try_arima_labels=numpy.concatenate((maindata_labels,arima_model_forcast))

In [177]:
try_arima_labels[-number_of_next_days:]

array([27.85787979, 55.11731605, 35.89136579, 47.00386558, 53.86130288,
       43.1842891 , 63.66847391])

In [178]:
random_forest_model.fit(pandas.DataFrame(try_data),try_arima_labels)

RandomForestRegressor()

In [179]:
final_arima_model_forcast=random_forest_model.predict(pandas.DataFrame(try_data))[-number_of_next_days:]

In [180]:
final_arima_model_forcast

array([29.57901125, 45.26406904, 41.62491264, 45.31768475, 50.8764882 ,
       47.48744061, 58.79092497])

## SARIMA model

In [181]:
main_sarima_model=SARIMAX(forcasting_data_features,order=order_of_forecasting_models2)
train_sarima_model = SARIMAX(forcasting_train_data_features,order=order_of_forecasting_models2)



In [182]:
main_sarima_fit_model=main_sarima_model.fit()
train_sarima_fit_model=train_sarima_model.fit()

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


In [183]:
# sarima_model_fit.predict(590)
main_sarima_fit_model_predicted=main_sarima_fit_model.predict()
train_sarima_fit_model_predicted=train_sarima_fit_model.predict()
test_sarima_predicted=train_sarima_fit_model.forecast(steps=len(forcasting_test_data_features))

In [184]:
sarima_model_forcast=main_sarima_fit_model.forecast(steps=number_of_next_days)

In [185]:
# main_sarima_fit_model_predicted
# train_sarima_fit_model_predicted
# test_sarima_predicted
sarima_model_forcast

2021-11-01    25.656719
2021-11-02    38.881787
2021-11-03    24.928360
2021-11-04    33.656291
2021-11-05    32.629916
2021-11-06    25.355898
2021-11-07    37.157403
Freq: D, Name: predicted_mean, dtype: float64

In [88]:
pyplot.figure('SARIMA model Forcasting')
pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
pyplot.plot(train_data['date'],train_sarima_fit_model_predicted,label='Train Data')
pyplot.plot(test_data['date'],test_sarima_predicted,label='Test Data')
pyplot.plot(daterange,sarima_model_forcast,label='SARIMA Future Prediction')
pyplot.xlabel('Date')
pyplot.ylabel('{}'.format(name_of_case))
pyplot.legend()
pyplot.show()

In [186]:
print(mean_absolute_error(train_sarima_fit_model_predicted,train_labels),numpy.sqrt(mean_squared_error(train_sarima_fit_model_predicted,train_labels)))
print(mean_absolute_error(test_sarima_predicted,test_labels),numpy.sqrt(mean_squared_error(test_sarima_predicted,test_labels)))
    

74.20716118750424 128.89884534571124
61.03026576502107 113.62207386040494


In [187]:
try_sarima_labels=numpy.concatenate((maindata_labels,sarima_model_forcast))

In [188]:
try_sarima_labels[-number_of_next_days:]

array([25.65671933, 38.88178735, 24.92835974, 33.65629089, 32.62991571,
       25.35589812, 37.15740348])

In [189]:
random_forest_model.fit(pandas.DataFrame(try_data),try_sarima_labels)

RandomForestRegressor()

In [190]:
final_sarima_model_forcast=random_forest_model.predict(pandas.DataFrame(try_data))[-number_of_next_days:]

In [191]:
final_sarima_model_forcast

array([27.4269051 , 33.42079154, 29.44777397, 31.49764422, 31.92112327,
       27.75659057, 32.90247619])

## Evaluation
  
    train : mean_absolute_error     root_mean_squared_error
    test :  mean_absolute_error     root_mean_squared_error

In [192]:
print('ARIMA model')
print(mean_absolute_error(train_arima_fit_model_predicted,train_labels),numpy.sqrt(mean_squared_error(train_arima_fit_model_predicted,train_labels)))
print(mean_absolute_error(test_arima_predicted,test_labels),numpy.sqrt(mean_squared_error(test_arima_predicted,test_labels)))
    

ARIMA model
74.74710378044784 128.9800894057669
104.23495538079318 143.42273131150176


In [193]:
print('SARIMA model')
print(mean_absolute_error(train_sarima_fit_model_predicted,train_labels),numpy.sqrt(mean_squared_error(train_sarima_fit_model_predicted,train_labels)))
print(mean_absolute_error(test_sarima_predicted,test_labels),numpy.sqrt(mean_squared_error(test_sarima_predicted,test_labels)))
    

SARIMA model
74.20716118750424 128.89884534571124
61.03026576502107 113.62207386040494


## Final Model Forecast

In [194]:
final_arima_model_forcast

array([29.57901125, 45.26406904, 41.62491264, 45.31768475, 50.8764882 ,
       47.48744061, 58.79092497])

In [195]:
final_sarima_model_forcast

array([27.4269051 , 33.42079154, 29.44777397, 31.49764422, 31.92112327,
       27.75659057, 32.90247619])

In [196]:
final_data_forcast=(final_arima_model_forcast+final_sarima_model_forcast)/2

In [197]:
final_data_forcast

array([28.50295818, 39.34243029, 35.5363433 , 38.40766449, 41.39880574,
       37.62201559, 45.84670058])

In [198]:
final_forecast_labels=numpy.concatenate((maindata_labels,final_data_forcast))

In [199]:
final_forecast_labels[-number_of_next_days:]

array([28.50295818, 39.34243029, 35.5363433 , 38.40766449, 41.39880574,
       37.62201559, 45.84670058])

In [200]:
random_forest_model.fit(pandas.DataFrame(try_data),final_forecast_labels)

RandomForestRegressor()

In [201]:
final_model_forcast=random_forest_model.predict(pandas.DataFrame(try_data))[-number_of_next_days:]

In [202]:
maindata_labels[-number_of_next_days:]

547    41
548    41
549    39
550    48
551    59
552    13
553    26
Name: delta_confirmed, dtype: int64

In [203]:
final_model_forcast

array([29.04078519, 36.05585173, 36.63565153, 38.01704926, 40.24097562,
       38.99387518, 42.81078551])

In [204]:
final_prediction=pandas.DataFrame()

In [205]:
final_prediction['Future Dates']=daterange

In [206]:
final_prediction['ARIMA Prediction']=arima_model_forcast.to_numpy()

In [207]:
final_prediction['SARIMA Prediction']=sarima_model_forcast.to_numpy()

In [208]:
def range_finder(x,y):
    if x<y:
        return(x,y)
    else:
        return(y,x)
range_of_prediction=[]
for i in range(number_of_next_days):
    range_of_prediction.append(range_finder(arima_model_forcast.to_numpy()[i],sarima_model_forcast.to_numpy()[i]))
# print(range_of_prediction)
excepted_range_of_prediction=["{}   to   {}".format(int(value[0]),int(value[1])) for value in range_of_prediction]
print(excepted_range_of_prediction)

['25   to   27', '38   to   55', '24   to   35', '33   to   47', '32   to   53', '25   to   43', '37   to   63']


In [209]:
final_prediction['Excepted Range of Prediction']=excepted_range_of_prediction

In [210]:
final_prediction['Final Excepted Prediction']=final_model_forcast.astype(int)

In [211]:
final_prediction['Future Dates']=[str(value) for value in daterange]

In [214]:
final_prediction

Unnamed: 0,Future Dates,ARIMA Prediction,SARIMA Prediction,Excepted Range of Prediction,Final Excepted Prediction
0,2021-11-01,27.85788,25.656719,25 to 27,29
1,2021-11-02,55.117316,38.881787,38 to 55,36
2,2021-11-03,35.891366,24.92836,24 to 35,36
3,2021-11-04,47.003866,33.656291,33 to 47,38
4,2021-11-05,53.861303,32.629916,32 to 53,40
5,2021-11-06,43.184289,25.355898,25 to 43,38
6,2021-11-07,63.668474,37.157403,37 to 63,42


In [215]:
# final_prediction.to_json()

In [216]:
pyplot.figure('Final model Forcasting')
pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
pyplot.plot(daterange,arima_model_forcast,label='ARIMA Future Prediction')
pyplot.plot(daterange,sarima_model_forcast,label='SARIMA Future Prediction')
pyplot.plot(daterange,final_model_forcast,label='final Future {} Prediction'.format(name_of_case))

for i in range(number_of_next_days):
    pyplot.text(daterange[i],arima_model_forcast.to_numpy()[i],'{}'.format(int(arima_model_forcast.to_numpy()[i])))
for i in range(number_of_next_days):
    pyplot.text(daterange[i],sarima_model_forcast.to_numpy()[i],'{}'.format(int(sarima_model_forcast.to_numpy()[i])))
for i in range(number_of_next_days):
    pyplot.text(daterange[i],final_model_forcast[i],'{}'.format(int(final_model_forcast[i])))
    
pyplot.xlabel('Date')
pyplot.ylabel('{}'.format(name_of_case))

pyplot.grid(axis='x')
pyplot.legend()
pyplot.show()

In [217]:
# train_absolute_error=[]
# test_absolute_error=[]
# train_rmse=[]
# test_rmse=[]
# index=[]
# for p in range(7):
#     for q in range(7):
#         for r in range(7):
#             try:
#                 print('\nfor p={} , q={} , r={}'.format(p,q,r))
#                 train_sarima_model = ARIMA(forcasting_train_data_features,order=(p,q,r))
#                 train_sarima_fit_model=train_sarima_model.fit()
#                 train_sarima_fit_model_predicted=train_sarima_fit_model.predict()
#                 test_sarima_predicted=train_sarima_fit_model.forecast(steps=len(forcasting_test_data_features))
#                 train_absolute_error.append(mean_absolute_error(train_sarima_fit_model_predicted,train_labels))
#                 train_rmse.append(numpy.sqrt(mean_squared_error(train_sarima_fit_model_predicted,train_labels)))
#                 test_absolute_error.append(mean_absolute_error(test_sarima_predicted,test_labels))
#                 test_rmse.append(numpy.sqrt(mean_squared_error(test_sarima_predicted,test_labels)))
#                 index.append("{}{}{}".format(p,q,r))
#                 print(mean_absolute_error(train_sarima_fit_model_predicted,train_labels),numpy.sqrt(mean_squared_error(train_sarima_fit_model_predicted,train_labels)))
#                 print(mean_absolute_error(test_sarima_predicted,test_labels),numpy.sqrt(mean_squared_error(test_sarima_predicted,test_labels)))
#             except Exception as e:
#                 print(e)
# errors=pandas.DataFrame()
# errors['index']=index
# errors['train_absolute_error']=train_absolute_error
# errors['test_absolute_error']=test_absolute_error
# errors['train_rmse']=train_rmse
# errors['test_rmse']=test_rmse

In [218]:
# errors[errors['test_absolute_error']==min(errors['test_absolute_error'])]

## VAR Model

In [219]:
forcasting_data_features_for_VAR=maindata[['date','delta_confirmed','delta_vaccinated1','delta_vaccinated2']]
forcasting_train_data_features_for_VAR=train_data[['date','delta_confirmed','delta_vaccinated1','delta_vaccinated2']]

In [220]:
forcasting_data_features_for_VAR
# forcasting_train_data_features_for_VAR

Unnamed: 0,date,delta_confirmed,delta_vaccinated1,delta_vaccinated2
0,2020-04-26,0,0,0
1,2020-04-27,4,0,0
2,2020-04-28,5,0,0
3,2020-04-29,3,0,0
4,2020-04-30,2,0,0
...,...,...,...,...
549,2021-10-27,39,3536,4449
550,2021-10-28,48,2742,4458
551,2021-10-29,59,2803,4641
552,2021-10-30,13,3027,5713


In [221]:
forcasting_data_features_for_VAR['date']=pandas.to_datetime(forcasting_data_features_for_VAR['date'])
forcasting_train_data_features_for_VAR['date']=pandas.to_datetime(forcasting_train_data_features_for_VAR['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forcasting_data_features_for_VAR['date']=pandas.to_datetime(forcasting_data_features_for_VAR['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forcasting_train_data_features_for_VAR['date']=pandas.to_datetime(forcasting_train_data_features_for_VAR['date'])


In [222]:
# forcasting_data_features_for_VAR
forcasting_train_data_features_for_VAR

Unnamed: 0,date,delta_confirmed,delta_vaccinated1,delta_vaccinated2
0,2020-04-26,0,0,0
1,2020-04-27,4,0,0
2,2020-04-28,5,0,0
3,2020-04-29,3,0,0
4,2020-04-30,2,0,0
...,...,...,...,...
521,2021-09-29,157,45182,16086
522,2021-09-30,126,25650,9726
523,2021-10-01,120,28392,9694
524,2021-10-02,140,15961,9604


In [223]:
forcasting_data_features_for_VAR=forcasting_data_features_for_VAR.set_index('date')
forcasting_train_data_features_for_VAR=forcasting_train_data_features_for_VAR.set_index('date')

In [224]:
# forcasting_data_features_for_VAR
forcasting_train_data_features_for_VAR

Unnamed: 0_level_0,delta_confirmed,delta_vaccinated1,delta_vaccinated2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-26,0,0,0
2020-04-27,4,0,0
2020-04-28,5,0,0
2020-04-29,3,0,0
2020-04-30,2,0,0
...,...,...,...
2021-09-29,157,45182,16086
2021-09-30,126,25650,9726
2021-10-01,120,28392,9694
2021-10-02,140,15961,9604


In [225]:
main_var_model=VAR(forcasting_data_features_for_VAR)
main_var_train_model=VAR(forcasting_train_data_features_for_VAR)



In [226]:
main_var_model_fit=main_var_model.fit()
main_var_train_model_fit=main_var_train_model.fit()

In [227]:
# main_var_train_model_fit.plot()

In [228]:
var_model_forcast=main_var_model_fit.forecast(main_var_model.y,steps=7)
var_model_test_forcast=main_var_train_model_fit.forecast(main_var_model.y,steps=len(test_data))

  var_model_forcast=main_var_model_fit.forecast(main_var_model.y,steps=7)
  var_model_test_forcast=main_var_train_model_fit.forecast(main_var_model.y,steps=len(test_data))


In [229]:
var_model_test_forcast_data=pandas.DataFrame(var_model_test_forcast)

In [230]:
var_model_forcast_data=pandas.DataFrame(var_model_forcast)

In [231]:
var_model_forcast_data

Unnamed: 0,0,1,2
0,46.550501,2051.312027,1346.006853
1,69.628548,2692.703955,1408.086804
2,92.230286,2949.496518,1437.615898
3,113.456902,3064.452313,1451.040114
4,133.106215,3123.918533,1456.154883
5,151.201524,3159.796637,1456.899082
6,167.834353,3184.543962,1455.359377


In [232]:
pyplot.figure('VAR model Forcasting')
pyplot.plot(maindata['date'],maindata_labels,label='Original Data')
# pyplot.plot(maindata['date'],maindata['delta_vaccinated1'],label='Original Data')
pyplot.plot(test_data['date'],var_model_test_forcast_data[0],label='VAR Testing Prediction')
pyplot.plot(daterange,var_model_forcast_data[0],label='VAR Future Confirmed Prediction')
# pyplot.plot(daterange,var_model_forcast_data[1],label='VAR Future Vaccinated1 Prediction')

pyplot.legend()
pyplot.show()

In [233]:
print(mean_absolute_error(var_model_test_forcast_data[0],test_labels),numpy.sqrt(mean_squared_error(var_model_test_forcast_data[0],test_labels)))


191.91474643331475 230.41637384048587
