In [7]:
import pandas as pd

In [8]:
# Input columns read from the OWID CSV; 'date' later becomes the index and
# 'location' is used to select a single country.
features = [
    'date',
    'location',
    'new_tests_per_thousand',
    'new_cases_per_million',
    'total_cases_per_million',
    'tests_per_case',
    'positive_rate',
    'reproduction_rate',
    'stringency_index',
    'total_deaths_per_million',
]

In [9]:
# Column to be predicted; kept as a one-element list so it can be
# concatenated with `features`.
target = ['new_deaths_per_million']

In [10]:
# Show the full list of columns that will be requested from the CSV.
[*features, *target]

['date',
 'location',
 'new_tests_per_thousand',
 'new_cases_per_million',
 'total_cases_per_million',
 'tests_per_case',
 'positive_rate',
 'reproduction_rate',
 'stringency_index',
 'total_deaths_per_million',
 'new_deaths_per_million']

In [11]:
# Fetch the complete Our World in Data COVID-19 table from GitHub, parsing
# only the columns named in `features` and `target`.
url_data = r'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
df = pd.read_csv(url_data, usecols=features + target)

In [12]:
# Arrange the columns as `features` followed by `target`
# (read_csv ignores the element order of `usecols`).
df = df.loc[:, features + target]

In [13]:
# Use the 'date' column as the row index. The returned frame is bound back to
# `df` rather than mutating with inplace=True, which pandas discourages and
# which brings no performance benefit.
df = df.set_index('date')

In [14]:
# Convert the index values (the former 'date' column) to datetimes.
df.index = pd.to_datetime(df.index)

In [30]:
# Check the dtype of each column ('date' is the index now, so it is not listed).
df.dtypes

location                     object
new_tests_per_thousand      float64
new_cases_per_million       float64
total_cases_per_million     float64
tests_per_case              float64
positive_rate               float64
reproduction_rate           float64
stringency_index            float64
total_deaths_per_million    float64
new_deaths_per_million      float64
dtype: object

In [15]:
## add month info
#df['month'] = df.index.month

In [16]:
# List the column labels that remain after moving 'date' into the index.
df.columns

Index(['location', 'new_tests_per_thousand', 'new_cases_per_million',
       'total_cases_per_million', 'tests_per_case', 'positive_rate',
       'reproduction_rate', 'stringency_index', 'total_deaths_per_million',
       'new_deaths_per_million'],
      dtype='object')

In [17]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0_level_0,location,new_tests_per_thousand,new_cases_per_million,total_cases_per_million,tests_per_case,positive_rate,reproduction_rate,stringency_index,total_deaths_per_million,new_deaths_per_million
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-02-24,Afghanistan,,0.125,0.125,,,,8.33,,
2020-02-25,Afghanistan,,0.0,0.125,,,,8.33,,
2020-02-26,Afghanistan,,0.0,0.125,,,,8.33,,
2020-02-27,Afghanistan,,0.0,0.125,,,,8.33,,
2020-02-28,Afghanistan,,0.0,0.125,,,,8.33,,


In [18]:
# Preview the last five rows (tail() defaults to n=5).
df.tail()

Unnamed: 0_level_0,location,new_tests_per_thousand,new_cases_per_million,total_cases_per_million,tests_per_case,positive_rate,reproduction_rate,stringency_index,total_deaths_per_million,new_deaths_per_million
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-08-30,Zimbabwe,,0.25,16050.747,,,,,349.704,0.0
2022-08-31,Zimbabwe,,1.125,16051.872,,,,,349.892,0.188
2022-09-01,Zimbabwe,,0.75,16052.622,,,,,349.892,0.0
2022-09-02,Zimbabwe,,0.375,16052.997,,,,,349.892,0.0
2022-09-03,Zimbabwe,,0.688,16053.685,,,,,349.892,0.0


In [19]:
# filter by country
def filter_country(dataframe: pd.DataFrame, LOCATION: str) -> pd.DataFrame:
  """Return the rows of ``dataframe`` whose ``location`` column equals ``LOCATION``.

  The result is an independent copy, so callers can add or change columns on
  it without pandas emitting ``SettingWithCopyWarning`` and without touching
  ``dataframe``.

  Args:
    dataframe: Frame that has a ``location`` column.
    LOCATION: Value to match exactly.

  Returns:
    A new DataFrame with only the matching rows (empty if nothing matches);
    the original index labels are kept.
  """
  matches = dataframe['location'] == LOCATION
  # Explicit copy: the caller receives a frame it owns rather than a slice.
  return dataframe[matches].copy()

# filter by date
def filter_date(dataframe: pd.DataFrame, start_date: str, end_date: str) -> pd.DataFrame:
  """Keep only the rows whose index lies between ``start_date`` and ``end_date``.

  Both bounds are inclusive. The index is compared directly against the two
  arguments, so it must hold values comparable with them (presumably a
  DatetimeIndex compared with date strings; confirm against callers).
  """
  not_before_start = dataframe.index >= start_date
  not_after_end = dataframe.index <= end_date
  return dataframe.loc[not_before_start & not_after_end]

In [21]:
## crop date
#df_germany = filter_date(df_germany, '2020-03-10', '2022-09-01')
#df_germany.head(100)

# Let's try AutoML without any manual data preparation in advance

In [22]:
# Count the distinct values in 'location' (nunique() ignores NaN by default).
# NOTE(review): the values are assumed to be country names, but the count may
# also include non-country aggregates; confirm against the data source.
df['location'].nunique()

244

In [50]:
# Plot one country's new deaths per million together with a centred rolling
# mean over a window of 30 rows.
import plotly.express as px

country = 'Germany'
# .assign() builds a new frame, so the extra column is never written onto the
# filtered slice of `df`; a direct `df_country[...] = ...` on that slice makes
# pandas emit SettingWithCopyWarning.
df_country = filter_country(df, country).assign(
    deaths_per_million_average=lambda frame: (
        frame['new_deaths_per_million'].rolling(30, center=True).mean()
    )
)
fig = px.line(
    df_country,
    x=df_country.index,
    y=["new_deaths_per_million", "deaths_per_million_average"],
    title=country,
    template='plotly_dark',
)
fig.show()

In [31]:

from pycaret.regression import *

In [40]:
country = 'Germany'
# Rebuild df_country from df: the result has only df's own columns, so the
# rolling-average column that the plotting cell added to its df_country is
# not handed to setup() below.
df_country = filter_country(df, country)
df_country.head()

Unnamed: 0_level_0,location,new_tests_per_thousand,new_cases_per_million,total_cases_per_million,tests_per_case,positive_rate,reproduction_rate,stringency_index,total_deaths_per_million,new_deaths_per_million
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-27,Germany,,0.012,0.012,,,,5.56,,
2020-01-28,Germany,,0.036,0.048,,,,5.56,,
2020-01-29,Germany,,0.0,0.048,,,,5.56,,
2020-01-30,Germany,,0.0,0.048,,,,5.56,,
2020-01-31,Germany,,0.012,0.06,,,,5.56,,


In [51]:
# Configure the PyCaret regression experiment on the selected country's rows.
s = setup(
    df_country,
    target='new_deaths_per_million',
    # Train on 75% of the rows without shuffling them before the split.
    train_size=0.75,
    data_split_shuffle=False,
    # Cross-validate with 3 time-series folds.
    fold_strategy='timeseries',
    fold=3,
    # Columns excluded from modelling.
    ignore_features=[
        'location',
        'new_tests_per_thousand',
        'tests_per_case',
        'positive_rate',
        'reproduction_rate',
        'total_deaths_per_million',
    ],
    silent=True,
    verbose=False,
    session_id=123,
)

In [52]:
# Evaluate the available regressors with the experiment configured by
# setup(), rank them by mean absolute error and keep the top one.
best_model = compare_models(
    sort='MAE',
    verbose=True,
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,1.4184,6.3331,2.2213,0.2872,0.4986,1.1452,0.0233
xgboost,Extreme Gradient Boosting,1.5198,6.6177,2.2909,0.2433,0.5244,2.206,0.0333
rf,Random Forest Regressor,1.5267,6.8833,2.248,0.3166,0.5346,2.064,0.09
dt,Decision Tree Regressor,1.5556,6.8744,2.3653,0.1758,0.5522,2.3418,0.0067
omp,Orthogonal Matching Pursuit,1.5838,8.0793,2.286,0.3573,0.5116,1.5461,0.0067
et,Extra Trees Regressor,1.6208,7.9374,2.4237,0.1601,0.5276,2.1531,0.0767
catboost,CatBoost Regressor,1.7192,9.0689,2.5451,0.1423,0.5978,1.5111,0.51
lightgbm,Light Gradient Boosting Machine,1.7552,7.9366,2.544,-0.0659,0.5521,3.0119,0.0233
ada,AdaBoost Regressor,1.7774,7.3509,2.4775,0.0387,0.58,3.7972,0.0333
llar,Lasso Least Angle Regression,2.2173,11.6094,2.9894,-0.209,0.8093,6.1235,0.0067
