On 11 March 2020, the World Health Organization (WHO) officially classified COVID-19 as a pandemic. At the time, the number of confirmed cases was over 130,000 across 114 countries, and the disease had caused more than 5,000 deaths.
The objective of this project is to build an epidemiological model that predicts the spread of COVID-19 throughout the world. The target variable is the cumulative number of deaths caused by COVID-19 in each country by each date.

# Data

The training data covers 22 January to 5 March 2020. “target” is the number of confirmed deaths and “cases” is the number of reported infections.

In [1]:
import pandas as pd
# Directory holding the input CSV files. Kept in one constant so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = 'C:/Users/Rayen/Desktop/AI'

# data: training observations; predf / predt: sample-submission files
# (predt is the frame that is later filled with predictions and written out).
data = pd.read_csv(DATA_DIR + '/train.csv')
predf = pd.read_csv(DATA_DIR + '/SampleSubLocal.csv')
predt = pd.read_csv(DATA_DIR + '/SampleSubmission1.csv')

In [29]:
# Territory name = text before the first 'X' in "Territory X Date", trimmed.
# `n` is passed by keyword: since pandas 2.0 every argument of str.split after
# `pat` is keyword-only, so the positional form split('X', 1) raises TypeError.
data['Territory'] = data['Territory X Date'].str.split('X', n=1).str[0].str.strip()
# Parse the date strings so they can be compared against date bounds later.
data['Date'] = pd.to_datetime(data['Date'])
# Row label -> target value lookup.
dic = data['target'].to_dict()

In [2]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0,Territory X Date,target,cases,Territory,Date
0,Afghanistan X 1/22/20,0,0,Afghanistan,1/22/20
1,Afghanistan X 1/23/20,0,0,Afghanistan,1/23/20
2,Afghanistan X 1/24/20,0,0,Afghanistan,1/24/20
3,Afghanistan X 1/25/20,0,0,Afghanistan,1/25/20
4,Afghanistan X 1/26/20,0,0,Afghanistan,1/26/20


In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
import itertools
import math
from statsmodels.tsa.arima_model import ARIMA
import warnings
%matplotlib inline

# Hyperparameter Tuning

In [31]:
from sklearn.metrics import mean_squared_error
def find_best_arima(train):
    """Find the ARIMA order (p, d, q) with the lowest walk-forward MSE.

    The first 66% of ``train`` seeds the model history and the rest is held
    out. For each candidate order with p, d, q in 0..3, the model is refitted
    before every held-out point, a one-step forecast is recorded, and the
    true observation is then appended to the history. Candidates are scored
    with the mean squared error between the held-out values and the
    forecasts. Each improvement, and the final winner, is printed.

    Parameters:
        train: observations in time order; must support ``astype``, ``len``
            and slicing (e.g. a NumPy array).

    Returns:
        dict with keys ``'order'`` (best (p, d, q) tuple) and ``'mse'`` (its
        score). If no candidate could be fitted and scored, ``'order'`` is
        ``[]`` and ``'mse'`` is ``inf``.
    """
    # NOTE: this installs a process-wide "ignore" filter that stays active
    # after the function returns.
    warnings.filterwarnings("ignore")
    train = train.astype('float32')

    p_values = [0, 1, 2, 3]
    d_values = [0, 1, 2, 3]
    q_values = [0, 1, 2, 3]

    # The split does not depend on the candidate order, so compute it once.
    train_size = int(len(train) * 0.66)
    train_set, test_set = train[0:train_size], train[train_size:]

    best_score = float("inf")
    best_cfg = []

    # product() visits the orders in the same sequence as nested p/d/q loops.
    for param in itertools.product(p_values, d_values, q_values):
        try:
            history = [x for x in train_set]
            predictions = []
            for t in range(len(test_set)):
                mod = ARIMA(history, order=param)
                results = mod.fit(disp=False)
                yhat = results.forecast()[0]
                predictions.append(yhat)
                # Walk forward: reveal the true value before the next step.
                history.append(test_set[t])
            error = mean_squared_error(test_set, predictions)
        except Exception:
            # A candidate that cannot be fitted or scored is skipped.
            # KeyboardInterrupt / SystemExit are deliberately not caught, so
            # a long search can still be interrupted.
            continue

        if error < best_score:
            best_score, best_cfg = error, param
            print('ARIMA%s MSE=%.3f ' % (best_cfg, error))

    print(best_cfg)
    print(best_score)

    best_model = {
        'order': best_cfg,
        'mse': best_score,
    }

    return best_model

In [32]:
def prediction(order, series, steps=67):
    """Fit an ARIMA model on ``series`` and forecast the following periods.

    Parameters:
        order: (p, d, q) order passed to ``ARIMA``.
        series: observations to fit on; cast to float32 before fitting.
        steps: number of future periods to forecast. Defaults to 67, the
            horizon that was previously hard-coded.

    Returns:
        The first element of the ``forecast`` result. With the legacy
        ``statsmodels.tsa.arima_model`` API imported by this file that is
        presumably the array of point forecasts -- confirm against the
        installed statsmodels version.
    """
    series = series.astype('float32')
    model = ARIMA(series, order=order)
    model_fit = model.fit(disp=False)
    output = model_fit.forecast(steps=steps)
    return output[0]

# Forecasting

In [34]:
predictions = []
shit = []  # not read anywhere in the visible notebook

# Forecast the number of deaths due to COVID-19 in each country for the next
# two months. For every territory the output list receives the observed
# targets inside [start_date, end_date] followed by the model forecast.
start_date = '2020-03-06'
end_date = '2020-04-01'

for territory in data['Territory'].unique():
    print("**********************"+territory+"**********************")
    history = data.loc[data['Territory'] == territory, ['Date', 'target']]

    # Observed values for the date window (both bounds inclusive).
    in_window = history['Date'].between(start_date, end_date)
    observed = history.loc[in_window, 'target'].tolist()

    # Date-indexed one-column frame; its .values feed the model functions.
    series = history.set_index('Date')
    best_model = find_best_arima(series.values)
    print('best config is: ARIMA%s MSE=%.3f  '% (best_model['order'],best_model['mse']))
    yhat = prediction(best_model['order'], series.values)

    predictions.extend(observed)
    predictions.extend(yhat)

**********************Afghanistan**********************
ARIMA(0, 0, 0) MSE=4.209 
ARIMA(0, 1, 0) MSE=0.235 
(0, 1, 0)
0.23457294944010035
best config is: ARIMA(0, 1, 0) MSE=0.235  
**********************Albania**********************
ARIMA(0, 0, 0) MSE=32.710 
ARIMA(0, 1, 0) MSE=1.210 
(0, 1, 0)
1.209949939035383
best config is: ARIMA(0, 1, 0) MSE=1.210  
**********************Algeria**********************
ARIMA(0, 0, 0) MSE=385.326 
ARIMA(0, 1, 0) MSE=13.153 
ARIMA(0, 2, 0) MSE=3.948 
(0, 2, 0)
3.9480842879707323
best config is: ARIMA(0, 2, 0) MSE=3.948  
**********************Andorra**********************
ARIMA(0, 0, 0) MSE=17.508 
ARIMA(0, 1, 0) MSE=1.429 
ARIMA(0, 2, 0) MSE=1.135 
(0, 2, 0)
1.1350158059488573
best config is: ARIMA(0, 2, 0) MSE=1.135  
**********************Angola**********************
ARIMA(0, 0, 0) MSE=0.613 
ARIMA(0, 1, 0) MSE=0.160 
(0, 1, 0)
0.16010385113147962
best config is: ARIMA(0, 1, 0) MSE=0.160  
**********************Antigua and Barbuda******************

(0, 1, 0)
0.04013241645948923
best config is: ARIMA(0, 1, 0) MSE=0.040  
**********************Cyprus**********************
ARIMA(0, 0, 0) MSE=10.667 
ARIMA(0, 1, 0) MSE=0.562 
(0, 1, 0)
0.5623708013850284
best config is: ARIMA(0, 1, 0) MSE=0.562  
**********************Czechia**********************
ARIMA(0, 0, 0) MSE=133.285 
ARIMA(0, 1, 0) MSE=8.414 
ARIMA(0, 2, 0) MSE=1.344 
(0, 2, 0)
1.3437940593797995
best config is: ARIMA(0, 2, 0) MSE=1.344  
**********************Côte d'Ivoire**********************
ARIMA(0, 0, 0) MSE=0.153 
ARIMA(0, 1, 0) MSE=0.040 
(0, 1, 0)
0.04002596277809858
best config is: ARIMA(0, 1, 0) MSE=0.040  
**********************Democratic People's Republic of Korea (the)**********************
ARIMA(0, 0, 0) MSE=7283.069 
ARIMA(0, 0, 1) MSE=2137.293 
ARIMA(0, 1, 0) MSE=17.140 
ARIMA(0, 1, 1) MSE=16.193 
ARIMA(0, 1, 2) MSE=12.427 
ARIMA(0, 1, 3) MSE=12.219 
ARIMA(0, 2, 1) MSE=7.353 
(0, 2, 1)
7.353136891307131
best config is: ARIMA(0, 2, 1) MSE=7.353  
*************

(0, 2, 0)
14.265945888904788
best config is: ARIMA(0, 2, 0) MSE=14.266  
**********************Israel**********************
ARIMA(0, 0, 0) MSE=70.751 
ARIMA(0, 1, 0) MSE=3.506 
ARIMA(0, 2, 0) MSE=2.010 
(0, 2, 0)
2.0098414447279698
best config is: ARIMA(0, 2, 0) MSE=2.010  
**********************Italy**********************
ARIMA(0, 0, 0) MSE=34205642.710 
ARIMA(0, 0, 1) MSE=9146784.739 
ARIMA(0, 1, 0) MSE=246267.972 
ARIMA(0, 1, 1) MSE=103880.520 
ARIMA(0, 2, 0) MSE=22772.986 
ARIMA(0, 2, 1) MSE=17122.767 
(0, 2, 1)
17122.7672860918
best config is: ARIMA(0, 2, 1) MSE=17122.767  
**********************Jamaica**********************
ARIMA(0, 0, 0) MSE=0.747 
ARIMA(0, 1, 0) MSE=0.198 
(0, 1, 0)
0.19781357599249383
best config is: ARIMA(0, 1, 0) MSE=0.198  
**********************Japan**********************
ARIMA(0, 0, 0) MSE=928.755 
ARIMA(0, 0, 1) MSE=248.270 
ARIMA(0, 1, 0) MSE=5.354 
ARIMA(0, 1, 1) MSE=5.265 
ARIMA(0, 2, 1) MSE=4.061 
(0, 2, 1)
4.061410794949064
best config is: ARIMA(0, 

ARIMA(0, 1, 0) MSE=7.163 
ARIMA(0, 2, 0) MSE=4.256 
(0, 2, 0)
4.255627953812155
best config is: ARIMA(0, 2, 0) MSE=4.256  
**********************Oman**********************
ARIMA(0, 0, 0) MSE=0.079 
ARIMA(0, 1, 0) MSE=0.040 
(0, 1, 0)
0.040008401593606975
best config is: ARIMA(0, 1, 0) MSE=0.040  
**********************Pakistan**********************
ARIMA(0, 0, 0) MSE=91.327 
ARIMA(0, 1, 0) MSE=3.516 
ARIMA(0, 2, 0) MSE=2.396 
(0, 2, 0)
2.395986312315525
best config is: ARIMA(0, 2, 0) MSE=2.396  
**********************Palau**********************
ARIMA(0, 0, 0) MSE=0.000 
(0, 0, 0)
0.0
best config is: ARIMA(0, 0, 0) MSE=0.000  
**********************Panama**********************
ARIMA(0, 0, 0) MSE=112.585 
ARIMA(0, 1, 0) MSE=5.082 
ARIMA(0, 2, 0) MSE=3.981 
(0, 2, 0)
3.9808715680570277
best config is: ARIMA(0, 2, 0) MSE=3.981  
**********************Papua New Guinea**********************
ARIMA(0, 0, 0) MSE=0.000 
(0, 0, 0)
0.0
best config is: ARIMA(0, 0, 0) MSE=0.000  
*******************

ARIMA(1, 2, 1) MSE=0.391 
ARIMA(2, 2, 1) MSE=0.389 
(2, 2, 1)
0.38875737733352417
best config is: ARIMA(2, 2, 1) MSE=0.389  
**********************Tajikistan**********************
ARIMA(0, 0, 0) MSE=0.000 
(0, 0, 0)
0.0
best config is: ARIMA(0, 0, 0) MSE=0.000  
**********************Thailand**********************
ARIMA(0, 0, 0) MSE=16.265 
ARIMA(0, 0, 1) MSE=6.383 
ARIMA(0, 1, 0) MSE=0.770 
ARIMA(0, 1, 1) MSE=0.720 
ARIMA(0, 1, 2) MSE=0.709 
ARIMA(0, 2, 1) MSE=0.534 
ARIMA(2, 2, 1) MSE=0.502 
(2, 2, 1)
0.5017505931459586
best config is: ARIMA(2, 2, 1) MSE=0.502  
**********************Timor-Leste**********************
ARIMA(0, 0, 0) MSE=0.000 
(0, 0, 0)
0.0
best config is: ARIMA(0, 0, 0) MSE=0.000  
**********************Togo**********************
ARIMA(0, 0, 0) MSE=0.337 
ARIMA(0, 1, 0) MSE=0.079 
(0, 1, 0)
0.0788851927312329
best config is: ARIMA(0, 1, 0) MSE=0.079  
**********************Tonga**********************
ARIMA(0, 0, 0) MSE=0.000 
(0, 0, 0)
0.0
best config is: ARIMA(0, 0,

+ Rounding the numbers to improve accuracy

In [44]:
def _round_prediction(value):
    """Convert one raw prediction to a non-negative integer count.

    Negative values become 0. Otherwise the value is rounded up when its
    fractional part exceeds 0.15 and rounded down when it does not.

    The earlier if/elif chain also tested the fractional part against 0.4,
    0.7 and 0.9, but those branches could never run because the 0.15 test
    came first and already matched; they are dropped, results are unchanged.
    """
    if value < 0:
        return 0
    whole = value // 1
    if value - whole > 0.15:
        return int(whole + 1)
    return int(whole)


predictionsf = [_round_prediction(value) for value in predictions]
print(predictionsf)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4, 5, 5, 6, 8, 10, 10, 11, 15, 15, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 27, 27, 27, 27, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 30, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 7, 9, 11, 15, 17, 17, 19, 21, 25, 26, 29, 31, 35, 44, 58, 73, 87, 102, 116, 131, 147, 162, 178, 193, 210, 226, 242, 259, 276, 293, 310, 327, 345, 363, 381, 399, 418, 436, 455, 474, 494, 513, 533, 553, 573, 593, 613, 634, 655, 676, 697, 719, 741, 763, 785, 807, 830, 852, 875, 898, 922, 945, 969, 993, 1017, 1041, 1066, 1091, 11

In [52]:
# Copy the rounded predictions into predt's 'target' column, one row at a time.
# .loc is addressed with labels 0..len(predt)-1, so this assumes predt still
# has the default integer index from read_csv and that predictionsf holds at
# least len(predt) entries -- TODO confirm.
for i in range(len(predt)):
    predt.loc[i,'target']=predictionsf[i]

+ The final prediction:

In [53]:
# NOTE(review): this previews predf, which no visible cell modifies; the
# rounded predictions are written into predt -- confirm which frame was meant.
predf.head()

Unnamed: 0,Territory X Date,target
0,Afghanistan X 4/2/20,0
1,Afghanistan X 4/3/20,0
2,Afghanistan X 4/4/20,0
3,Afghanistan X 4/5/20,0
4,Afghanistan X 4/6/20,0


In [55]:
# Keep only the identifier column and the filled-in target column.
predt=predt[['Territory X Date','target']]

In [56]:
# Write the predictions frame to submission11.csv without the index column.
predt.to_csv("submission11.csv",index=False)