In [None]:
!pip install pmdarima

In [None]:
import copy
import numpy as np
import pandas as pd
import scipy.optimize as opt
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_log_error
import warnings; warnings.filterwarnings('ignore')
from scipy.stats import pearsonr
from scipy.optimize import curve_fit
from scipy.optimize import least_squares
from scipy import interpolate
from scipy.signal import savgol_filter
import statsmodels.api as sm
from pmdarima.arima import auto_arima
import seaborn as sns

In [None]:
# Load the competition data and parse the date columns
train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/train.csv', parse_dates=['Date'])
test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/test.csv', parse_dates=['Date'])

# Pull in population data to help with model fitting (only the first three columns are used)
populations = pd.read_csv('/kaggle/input/covid19-global-forecasting-locations-population/locations_population.csv').iloc[:, :3]
populations.columns = ['Province_State', 'Country_Region', 'Population']

# Replace missing provinces with an empty string. The result is assigned back
# instead of calling fillna(inplace=True) on the selected column: that form is
# deprecated in pandas and leaves the frame untouched under copy-on-write.
train['Province_State'] = train['Province_State'].fillna("")
test['Province_State'] = test['Province_State'].fillna("")
populations['Province_State'] = populations['Province_State'].fillna("")

# Combine the Country_Region and Province_State columns into a single region key
# (regions without a province end with a trailing space in all three frames)
train['Country_Region'] = train['Country_Region'] + ' ' + train['Province_State']
test['Country_Region'] = test['Country_Region'] + ' ' + test['Province_State']
populations['Country_Region'] = populations['Country_Region'] + ' ' + populations['Province_State']

# Delete the Province_State column because it is now part of Country_Region
del train['Province_State']
del test['Province_State']
del populations['Province_State']
Unique_Regions = train['Country_Region'].unique()

# Training rows dated on or after the first test date overlap the test period;
# they are kept aside as the evaluation set.
cutoff_date = test['Date'].min()
eval_set = train[train["Date"] >= cutoff_date]
# Overlap length per region (assumes every region has the same number of rows)
number_of_overlap_days = len(eval_set) // len(Unique_Regions)
days_to_predict = len(test['Date'].unique())

# Feature Engineering

In [None]:
def new_feature_from_old(function):
    """Build a transformed copy of the training rows that have at least one case.

    ``function`` is applied element-wise to the ``ConfirmedCases`` values of
    the rows with ``ConfirmedCases >= 1`` and to every ``Fatalities`` value of
    ``train``; the result keeps only the rows with at least one confirmed case
    (fatalities are matched back to those rows by index).
    """
    has_cases = train['ConfirmedCases'] >= 1
    cases_transformed = train.loc[has_cases, 'ConfirmedCases'].map(function)
    fatalities_transformed = train['Fatalities'].map(function)

    transformed = copy.copy(train[has_cases])
    transformed['ConfirmedCases'] = cases_transformed
    # Index alignment drops the fatalities of rows without confirmed cases
    transformed['Fatalities'] = fatalities_transformed
    return transformed

# Untransformed counts (identity), so every model reads its data the same way
Feature_0 = new_feature_from_old(lambda value: value)
# Square-root transformed counts
Feature_1 = new_feature_from_old(lambda value: np.sqrt(value))
# log(1 + count) transformed counts
Feature_2 = new_feature_from_old(lambda value: np.log(value + 1))

# Validation Error

In [None]:
def RMSLE(pred, actual):
    """Root mean squared logarithmic error.

    Parameters
    ----------
    pred, actual : array-like
        Predicted and observed values of equal length (lists, arrays or
        Series). Values are compared by position.

    Returns
    -------
    float
        ``sqrt(mean((log(1 + pred) - log(1 + actual)) ** 2))``. The result is
        NaN when any value is below -1.
    """
    pred = np.asarray(pred, dtype=float)
    actual = np.asarray(actual, dtype=float)
    # log1p keeps precision for values close to zero
    return np.sqrt(np.mean(np.square(np.log1p(pred) - np.log1p(actual))))

# Data Slice Function

In [None]:
def slice_by_region_variable_leaderboard(region, variable_idx, leaderboard_flag, feature):
    """Extract one target series for one region from a feature frame.

    Parameters
    ----------
    region : str
        Value to match in the ``Country_Region`` column of ``feature``.
    variable_idx : int
        1 --> confirmed cases, 2 --> fatalities.
    leaderboard_flag : int
        0 --> only use rows dated before ``cutoff_date``; any other value -->
        include all rows.
    feature : pandas.DataFrame
        Frame with ``Country_Region``, ``Date``, ``ConfirmedCases`` and
        ``Fatalities`` columns.

    Returns
    -------
    (numpy.ndarray, int)
        The series and a flag that is 1 when ``region`` has no rows in
        ``feature`` (the series is then ``days_to_predict`` zeros), else 0.

    Raises
    ------
    ValueError
        If ``variable_idx`` is neither 1 nor 2.
    """
    if variable_idx == 1:
        column = 'ConfirmedCases'
    elif variable_idx == 2:
        column = 'Fatalities'
    else:
        raise ValueError(f"variable_idx must be 1 or 2, got {variable_idx!r}")

    region_rows = feature['Country_Region'] == region
    if not region_rows.any():
        # Region absent from the feature frame: placeholder series of zeros
        return np.zeros(days_to_predict), 1

    if leaderboard_flag == 0:
        region_rows = region_rows & (feature['Date'] < cutoff_date)
    return feature.loc[region_rows, column].values, 0


# Arima Model - no transformation - Order selection with auto_arima

In [None]:
# Background on SARIMAX MLE convergence problems:
# https://stats.stackexchange.com/questions/313426/mle-convergence-errors-with-statespace-sarimax
def predict_0(region, variable_idx, leaderboard_flag):
    """Forecast one region with an auto-selected ARIMA model on the raw counts.

    Parameters
    ----------
    region : str
        Value of the combined ``Country_Region`` column.
    variable_idx : int
        1 --> confirmed cases, 2 --> fatalities.
    leaderboard_flag : int
        0 --> train only on rows dated before ``cutoff_date``; otherwise use
        all rows of ``Feature_0`` for the region.

    Returns
    -------
    pandas.Series
        ``days_to_predict`` forecast values. All zeros when the region has no
        rows in ``Feature_0`` or when order selection / fitting fails.
    """
    endog, missing_flag = slice_by_region_variable_leaderboard(region, variable_idx, leaderboard_flag, Feature_0)
    if missing_flag:
        return pd.Series(np.zeros(days_to_predict))

    try:
        # auto_arima returns the best model already fitted on endog, so no
        # second fit is needed before predicting.
        model = auto_arima(endog, trace=False, error_action='ignore', n_jobs=-1, maxiter=400, disp=-1, suppress_warnings=True)
        forecast = model.predict(n_periods=days_to_predict)
    except Exception:
        # Best-effort fallback; KeyboardInterrupt is deliberately not caught
        forecast = np.zeros(days_to_predict)
    return pd.Series(forecast)

# Arima Model with Square Root transform - Order selection with auto_arima

In [None]:
def predict_1(region, variable_idx, leaderboard_flag):
    """Forecast one region with an auto-selected ARIMA model on sqrt counts.

    The model is fitted on the square-root transformed series (``Feature_1``)
    and the forecast is squared to return to the original scale.

    Parameters
    ----------
    region : str
        Value of the combined ``Country_Region`` column.
    variable_idx : int
        1 --> confirmed cases, 2 --> fatalities.
    leaderboard_flag : int
        0 --> train only on rows dated before ``cutoff_date``; otherwise use
        all rows of ``Feature_1`` for the region.

    Returns
    -------
    pandas.Series
        ``days_to_predict`` forecast values on the original scale. All zeros
        when the region has no rows in ``Feature_1`` or when order selection /
        fitting fails.
    """
    endog, missing_flag = slice_by_region_variable_leaderboard(region, variable_idx, leaderboard_flag, Feature_1)

    if missing_flag:
        forecast = np.zeros(days_to_predict)
    else:
        try:
            # auto_arima returns the best model already fitted on endog
            model = auto_arima(endog, trace=False, error_action='ignore', n_jobs=-1, maxiter=400, disp=-1, suppress_warnings=True)
            forecast = model.predict(n_periods=days_to_predict)
        except Exception:
            # Best-effort fallback; KeyboardInterrupt is deliberately not caught
            forecast = np.zeros(days_to_predict)
    print(region)  # progress indicator
    # Undo the square-root transform
    return pd.Series(forecast) ** 2

# Arima Model with Log Transform - Order selection with auto_arima

In [None]:
def predict_2(region, variable_idx, leaderboard_flag):
    """Forecast one region with an auto-selected ARIMA model on log counts.

    The model is fitted on the ``log(1 + count)`` transformed series
    (``Feature_2``) and the forecast is mapped back with ``exp(x) - 1``.

    Parameters
    ----------
    region : str
        Value of the combined ``Country_Region`` column.
    variable_idx : int
        1 --> confirmed cases, 2 --> fatalities.
    leaderboard_flag : int
        0 --> train only on rows dated before ``cutoff_date``; otherwise use
        all rows of ``Feature_2`` for the region.

    Returns
    -------
    pandas.Series
        ``days_to_predict`` forecast values on the original scale. All zeros
        when the region has no rows in ``Feature_2`` or when order selection /
        fitting fails.
    """
    endog, missing_flag = slice_by_region_variable_leaderboard(region, variable_idx, leaderboard_flag, Feature_2)

    if missing_flag:
        forecast = np.zeros(days_to_predict)
    else:
        try:
            # auto_arima returns the best model already fitted on endog
            model = auto_arima(endog, trace=False, error_action='ignore', n_jobs=-1, maxiter=400, disp=-1, suppress_warnings=True)
            forecast = model.predict(n_periods=days_to_predict)
        except Exception:
            # Best-effort fallback; KeyboardInterrupt is deliberately not caught
            forecast = np.zeros(days_to_predict)
    print(region)  # progress indicator
    # Undo the log(1 + x) transform; expm1 is the accurate form of exp(x) - 1
    return np.expm1(pd.Series(forecast))

# Multi-Level Logistic Model

In [None]:
def multi_level_logistic_model(x, t):
    """Evaluate the logistic curve ``x[0] / (1 + exp(-x[1] * (t - x[2])))``.

    Parameters
    ----------
    x : sequence of three numbers
        Plateau height, growth rate and midpoint of the curve.
    t : array-like
        Time points (list, ``range`` or array).

    Returns
    -------
    numpy.ndarray
        Curve values at ``t``.
    """
    # Explicit conversion so range/list inputs work with any numeric x
    t = np.asarray(t, dtype=float)
    return x[0]/(1 + np.exp(-x[1] * (t - x[2])))

def multi_level_logistic(x, t, y):
    """Residuals of the logistic curve against observations ``y``.

    This is the objective passed to ``least_squares``: model value minus
    observation at every time point in ``t``.
    """
    return multi_level_logistic_model(x, t) - y

def guess_parameters(y, population, flag):
    """Heuristic starting point for the logistic fit.

    Parameters
    ----------
    y : array-like
        Non-empty series to be fitted.
    population : number
        Population of the region; the plateau guess is ``1E-5 * population``.
    flag : int
        0 --> first pass (midpoint guessed 7 steps before the half-maximum
        location), otherwise --> correction pass (midpoint guessed 21 steps
        after the first value greater than 1, or at 20 when there is none).

    Returns
    -------
    numpy.ndarray
        ``[plateau, rate, midpoint]``.
    """
    y = np.asarray(y, dtype=float)
    middle_loc = int(np.abs(y - y.max()/2).argmin())
    perct_75_loc = int(np.abs(y - 3*y.max()/4).argmin())

    above_one = np.where(y > 1)[0]
    # -1 is the sentinel used when no value exceeds 1
    first_case_loc = above_one[0] if len(above_one) else -1

    x = np.ones(3)
    x[0] = 1E-5*population
    # Estimation of the rate of transmission; max() keeps the divisor >= 1
    x[1] = 20/max(1, perct_75_loc - middle_loc)
    if flag == 0:
        x[2] = middle_loc - 7
    else:
        x[2] = first_case_loc + 21
    return x

    
def fit_model(y_train, population, n):
    """Fit two stacked logistic curves to ``y_train`` and extrapolate them.

    A first logistic curve is fitted to the series with a soft-L1 loss; a
    second one is fitted to the residuals of the first with a Cauchy loss.
    The forecast is the sum of both curves.

    Parameters
    ----------
    y_train : numpy.ndarray
        Observed series, one value per time step.
    population : number
        Region population, used for the initial plateau guess.
    n : int
        Forecast horizon; note that ``n + 1`` values are returned.

    Returns
    -------
    numpy.ndarray
        Forecast for time steps ``len(y_train)`` .. ``len(y_train) + n``.
    """
    t_train = np.arange(len(y_train))
    t_final = np.arange(len(y_train), len(y_train) + n + 1)

    first_guess = guess_parameters(y_train, population, 0)
    first_fit = least_squares(multi_level_logistic, first_guess, loss='soft_l1', f_scale=0.1, args=(t_train, y_train))

    # Second stage models whatever the first curve left unexplained
    residual = y_train - multi_level_logistic_model(first_fit.x, t_train)
    second_guess = guess_parameters(residual, population, 1)
    second_fit = least_squares(multi_level_logistic, second_guess, loss='cauchy', f_scale=0.1, args=(t_train, residual))

    return multi_level_logistic_model(first_fit.x, t_final) + multi_level_logistic_model(second_fit.x, t_final)

def predict_3(region, variable_idx, leaderboard_flag):
    """Forecast one region with the two-stage logistic model.

    Parameters
    ----------
    region : str
        Value of the combined ``Country_Region`` column.
    variable_idx : int
        1 --> cases, 2 --> fatalities.
    leaderboard_flag : int
        0 --> fit on rows dated before ``cutoff_date``, forecast
        ``days_to_predict - number_of_overlap_days`` steps and prepend the
        last ``number_of_overlap_days`` training values; otherwise --> fit on
        all rows and forecast ``days_to_predict`` steps.

    Returns
    -------
    numpy.ndarray
        ``days_to_predict`` values.

    Raises
    ------
    ValueError
        If ``variable_idx`` is neither 1 nor 2.
    IndexError
        If ``populations`` has no row for ``region``.
    """
    target_columns = {1: 'ConfirmedCases', 2: 'Fatalities'}
    if variable_idx not in target_columns:
        raise ValueError(f"variable_idx must be 1 or 2, got {variable_idx!r}")

    region_rows = train['Country_Region'] == region
    if leaderboard_flag == 0:
        region_rows = region_rows & (train["Date"] < cutoff_date)
        n = days_to_predict - number_of_overlap_days
    else:
        n = days_to_predict
    ydata = train.loc[region_rows, target_columns[variable_idx]].values

    region_population = populations.loc[populations['Country_Region'] == region, 'Population']
    if region_population.empty:
        raise IndexError(f"no population entry for region {region!r}")
    population = region_population.iloc[0]

    # fit_model returns one value more than its horizon argument, hence n - 1
    forecast = fit_model(ydata, population, n-1)
    if leaderboard_flag == 0:
        # NOTE(review): ydata only holds rows before cutoff_date here, so the
        # prepended values are pre-cutoff observations - confirm this is the
        # intended filler for the overlap days.
        forecast = np.concatenate([ydata[-number_of_overlap_days:], forecast])
    return forecast

# Simple Fill Forward Model

In [None]:
def predict_4(region, variable_idx, leaderboard_flag):
    """Forward-fill baseline: repeat the last known value for every forecast day.

    Parameters
    ----------
    region : str
        Value of the combined ``Country_Region`` column.
    variable_idx : int
        1 --> confirmed cases, anything else --> fatalities.
    leaderboard_flag : int
        0 --> use the last value dated before ``cutoff_date`` (the same
        cut-off rule as the other models); otherwise --> use the last
        training value.

    Returns
    -------
    numpy.ndarray
        ``days_to_predict`` copies of the chosen value (zeros when the region
        has no usable rows).
    """
    column = 'ConfirmedCases' if variable_idx == 1 else 'Fatalities'

    region_rows = train['Country_Region'] == region
    if leaderboard_flag == 0:
        region_rows = region_rows & (train['Date'] < cutoff_date)
    history = train.loc[region_rows, column].values

    value = history[-1] if len(history) else 0
    return np.full(days_to_predict, value, dtype=float)

# Testing Logistic Model Fit

In [None]:
# Evaluation rows (dated on or after cutoff_date) of the first region
eval_set.loc[eval_set['Country_Region'] == Unique_Regions[0]]

In [None]:
# Compare the confirmed-case forecasts of every model for the first ten
# regions. Forecasts are drawn from t = 0 onwards; the evaluation rows (the
# last training days) are drawn at negative t.
for region in Unique_Regions[:10]:
    forecasts = [
        ('Arima no transf', predict_0(region, 1, 1)),
        ('Arima sqrt', predict_1(region, 1, 1)),
        # ('Arima log', predict_2(region, 1, 1)),
        ('Logistic', predict_3(region, 1, 1)),
        ('Forward Fill', predict_4(region, 1, 1)),
    ]
    for label, forecast in forecasts:
        # The x axis follows each forecast's own length instead of a fixed 43
        plt.plot(range(len(forecast)), forecast, label=label)
    actual = eval_set[eval_set['Country_Region']==region]['ConfirmedCases'].values
    plt.plot(range(-len(actual), 0), actual, label='Eval Data')
    plt.title(region)
    plt.legend()
    plt.show()

In [None]:
# For every region and target, keep the id (1-4) of the candidate model with
# the lowest RMSLE against the evaluation rows.
# NOTE(review): the candidates are run with leaderboard_flag=1, i.e. fitted on
# all training rows including the evaluation window, and their first forecast
# steps are scored against that same window - confirm this is the intended
# validation scheme (leaderboard_flag=0 would hold the window out).
candidate_models = (predict_1, predict_2, predict_3, predict_4)
model_idxs_confirmed = []
model_idxs_fatalities = []
for region in Unique_Regions:
    region_eval = eval_set[eval_set['Country_Region']==region]
    for variable_idx, column, chosen_ids in (
            (1, 'ConfirmedCases', model_idxs_confirmed),
            (2, 'Fatalities', model_idxs_fatalities)):
        actual = region_eval[column].values
        scores = np.array([
            RMSLE(np.asarray(model(region, variable_idx, 1))[:len(actual)], actual)
            for model in candidate_models
        ], dtype=float)
        if np.isnan(scores).all():
            # No usable score at all: fall back to the forward-fill model
            best_model_id = len(candidate_models)
        else:
            # np.argmin would return the position of a NaN score, selecting a
            # model whose error is undefined; nanargmin ignores NaN scores.
            best_model_id = int(np.nanargmin(scores)) + 1
        chosen_ids.append(best_model_id)

# Create Submission

In [None]:
def predict_final(region, variable_idx, leaderboard_flag, model_id):
    """Run the model selected by ``model_id`` for one region and target.

    ``model_id`` 1, 2 and 3 map to ``predict_1``, ``predict_2`` and
    ``predict_3``; every other value falls through to ``predict_4``.
    """
    numbered_models = ((1, predict_1), (2, predict_2), (3, predict_3))
    for candidate_id, predictor in numbered_models:
        if model_id == candidate_id:
            return predictor(region, variable_idx, leaderboard_flag)
    return predict_4(region, variable_idx, leaderboard_flag)

def _test_aligned_forecast(region, variable_idx, column, model_id):
    """Return ``days_to_predict`` values aligned with the region's test rows.

    The models are run with leaderboard_flag=1, so their forecast starts on
    the day after the last training row, while the test rows start at
    ``cutoff_date``. The overlapping days are therefore filled with the
    observed training values and the forecast covers the remaining days.
    """
    observed = eval_set.loc[eval_set['Country_Region'] == region, column].values
    forecast = np.asarray(predict_final(region, variable_idx, 1, model_id), dtype=float)
    return np.concatenate([observed, forecast])[:days_to_predict]


def _as_counts(per_region_values):
    """Flatten per-region values into non-negative integer counts.

    Non-finite values become 0 and negative values are clipped to 0 before
    the (truncating) integer conversion, because converting NaN/inf to int is
    undefined and counts cannot be negative.
    """
    values = np.concatenate(per_region_values).astype(float)
    values = np.where(np.isfinite(values), values, 0.0)
    return np.clip(values, 0, None).astype(int)


# Predictions are concatenated region by region, which assumes the test rows
# are ordered by region in the order of Unique_Regions and by date within it.
case_prediction = [_test_aligned_forecast(region, 1, 'ConfirmedCases', model_id)
                   for region, model_id in zip(Unique_Regions, model_idxs_confirmed)]
confirmed_case_predictions = _as_counts(case_prediction)
test['ConfirmedCases'] = confirmed_case_predictions

fatality_prediction = [_test_aligned_forecast(region, 2, 'Fatalities', model_id)
                       for region, model_id in zip(Unique_Regions, model_idxs_fatalities)]
fatality_predictions = _as_counts(fatality_prediction)
test['Fatalities'] = fatality_predictions

submission = test[['ForecastId', 'ConfirmedCases', 'Fatalities']]
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame (ForecastId, ConfirmedCases, Fatalities)
submission

In [None]:
# Sanity check for the first region: the last 20 observed confirmed-case
# counts and the submitted values, both drawn against their own dates so the
# two curves always have matching x/y lengths and a truthful alignment.
first_region_history = train[train['Country_Region'] == Unique_Regions[0]].iloc[-20:]
first_region_submitted = submission.iloc[:days_to_predict, 1]
plt.plot(first_region_history['Date'], first_region_history['ConfirmedCases'])
plt.plot(test['Date'].iloc[:len(first_region_submitted)], first_region_submitted)

# Selecting Stable Regions

In [None]:
# The merge does not depend on the region, so it is computed once.
# NOTE(review): pd.merge without `on=` joins on every column the two frames
# share; once test carries ConfirmedCases/Fatalities predictions those columns
# become join keys as well - confirm which rows full_set is meant to contain.
full_set = pd.merge(train, test)
for region in Unique_Regions:
    ydata = full_set[full_set['Country_Region'] == region]['ConfirmedCases'].values
    if len(ydata) == 0:
        # Nothing to assess for this region
        continue
    # A region is plotted when
    #  - its confirmed cases changed by less than 1% of its maximum across
    #    the last 7 observations (change the 7 for a different time span, the
    #    0.01 to be more or less restrictive), and
    #  - its latest count is above 20.
    ymax = ydata.max()
    if np.diff(ydata[-7:]).sum() < int(0.01*ymax) and ydata[-1] > 20:
        # Population is only needed for regions that are actually plotted
        pop = populations[populations['Country_Region'] == region].iloc[0,1]
        print(region)
        plt.plot(ydata/pop)
        plt.show()

In [None]:
# A region counts as stable when its confirmed cases changed by fewer than
# STABLE_MAX_NEW_CASES across the last STABLE_WINDOW observations and its
# latest count is above STABLE_MIN_TOTAL_CASES. Adjust the constants to use a
# different time span or to be more or less restrictive.
STABLE_WINDOW = 7
STABLE_MAX_NEW_CASES = 8
STABLE_MIN_TOTAL_CASES = 20

stable_countries = []
for region in Unique_Regions:
    ydata = train[train['Country_Region'] == region]['ConfirmedCases'].values
    if len(ydata) == 0:
        continue
    recent_change = np.diff(ydata[-STABLE_WINDOW:]).sum()
    if recent_change < STABLE_MAX_NEW_CASES and ydata[-1] > STABLE_MIN_TOTAL_CASES:
        stable_countries.append(region)
stable_countries

# Explore Number of Tests vs Number of Cases

In [None]:
# Worldwide testing data: first CSV column as index, parsed dates, missing
# values replaced by 0, rows ordered by region and date.
covid_test = (
    pd.read_csv('/kaggle/input/covid19testing/tested_worldwide.csv',
                index_col=0, parse_dates=['Date'])
    .fillna(0)
    .sort_values(by=['Country_Region', 'Date'])
)

In [None]:
# Display the testing data (sorted by Country_Region and Date)
covid_test

In [None]:
# Third-from-last column of the testing data for Vietnam, plotted as recorded
# and as its running maximum (which removes any dips in the series).
vietnam_series = covid_test[covid_test['Country_Region'] == 'Vietnam'].iloc[:, -3]
plt.plot(vietnam_series)
plt.plot(vietnam_series.cummax())
plt.show()

# Checking Distributions of Parameter Guesses

In [None]:
def fit_model(y):
    """Unfinished draft combining the bodies of guess_parameters and fit_model.

    NOTE(review): as written this function cannot run to completion, and
    executing this cell replaces the three-argument
    ``fit_model(y_train, population, n)`` defined in the "Multi-Level Logistic
    Model" section, which ``predict_3`` calls. Decide whether this cell should
    be renamed, completed or removed.
    """
    t = range(len(y))
    # flag=0 --> first pass, flag=1 --> correction
    # NOTE(review): x is an empty list, so the x[0]/x[1]/x[2] assignments
    # below raise IndexError (guess_parameters uses np.ones(3) instead).
    x = []
    # Positions where y is closest to half and to three quarters of its maximum
    middle_loc = int(np.abs(y - y.max()/2).argmin())
    perct_75_loc = int(np.abs(y - 3*y.max()/4).argmin())
    try:
        first_case_loc = np.where(y>1)[0][0]
    except:
        first_case_loc = -1
    # NOTE(review): `flag` and `population` are neither parameters nor locals
    # of this function, and no module-level definition is visible in this
    # notebook, so this raises NameError unless they exist in the kernel.
    if flag==0:
        x[0] = 1E-5*population
        x[2] =int(middle_loc)-7
    else:
        x[0] = 1E-5*population
        x[2] = first_case_loc + 21
    try:
        x[1] = 20/max(1, perct_75_loc - middle_loc) # estimation of rate of transmission
    except:
        x[1] = np.log(2)/2.
    return x
    # NOTE(review): everything below is unreachable (it follows `return x`)
    # and refers to y_train, t_train and t_final, which are not defined here.
    x0 = guess_parameters(y_train, population, 0)
    res_robust_1 = least_squares(multi_level_logistic, x0, loss='soft_l1', f_scale=0.1, args=(t_train, y_train))
    fit_1 = multi_level_logistic_model(res_robust_1.x, t_train)
    residual = y_train - fit_1
    x0 = guess_parameters(residual, population, 1)
    res_robust_2 = least_squares(multi_level_logistic, x0, loss='cauchy', f_scale=0.1, args=(t_train, residual))
    fit_2 = multi_level_logistic_model(res_robust_2.x, t_train)
    final_forecast = multi_level_logistic_model(res_robust_1.x, t_final) + multi_level_logistic_model(res_robust_2.x, t_final)
    return final_forecast

In [None]:
# Plot the first region's series (red) and overlay it in consecutive segments
# of SEGMENT_LENGTH steps; neighbouring segments share one end point, the same
# layout discrete_convexity_measure uses.
SEGMENT_LENGTH = 15

fig = plt.figure(figsize=(20,10))
# Column 3 of train - presumably ConfirmedCases; confirm against the frame
y = train[train['Country_Region'] == Unique_Regions[0]].iloc[:, 3]
x = np.arange(len(y))
plt.plot(x,y, color='r')
for start in range(0, len(y) - 1, SEGMENT_LENGTH):
    # The previous end index (i+1)*16 made segments grow and overlap more and
    # more; every segment now spans SEGMENT_LENGTH steps plus the shared point.
    stop = start + SEGMENT_LENGTH + 1
    plt.plot(x[start:stop], y.iloc[start:stop])
plt.show()

In [None]:
def convexity(y):
    """Deviation of a segment from the straight line joining its end points.

    Returns the sum of absolute differences between ``y`` and the chord from
    ``y[0]`` to ``y[-1]``, divided by ``max(1, y[-1] - y[0])``. Segments with
    fewer than two points have no chord and yield 0.0.
    """
    y = np.asarray(y, dtype=float)
    if len(y) < 2:
        return 0.0
    slope = (y[-1] - y[0])/(len(y) - 1)
    y_hat = np.arange(len(y))*slope + y[0]
    return float(np.sum(np.abs(y_hat - y)/max(1, y[-1] - y[0])))

def discrete_convexity_measure(y, m):
    """Piecewise convexity of ``y`` over consecutive segments of ``m`` steps.

    The series is cut into ``len(y) // m`` segments. Each segment includes
    the first point of the next one, and the last segment absorbs the
    remainder. A series shorter than ``m`` is treated as a single segment.

    Parameters
    ----------
    y : array-like
        Series to analyse.
    m : int
        Segment length, at least 1.

    Returns
    -------
    numpy.ndarray
        Array of ``len(y)`` values: each segment's convexity repeated for
        every step the segment covers.

    Raises
    ------
    ValueError
        If ``m`` is smaller than 1.
    """
    if m < 1:
        raise ValueError(f"m must be at least 1, got {m!r}")
    y = np.asarray(y, dtype=float)
    n = len(y)
    if n == 0:
        return np.zeros(0)

    n_segments = max(1, n // m)
    pieces = []
    for i in range(n_segments):
        start = i * m
        if i == n_segments - 1:
            # Last segment runs to the end of the series
            stop, steps = n, n - start
        else:
            # Include the first point of the next segment
            stop, steps = start + m + 1, m
        pieces.append(np.full(steps, convexity(y[start:stop])))
    return np.concatenate(pieces)

In [None]:
# Convexity series of y (the series selected in the plotting cell above)
# using segments of 10 steps
discrete_convexity_measure(y.values, 10)

In [None]:
# Accumulate, over segment lengths 10..24, the absolute step-to-step change of
# the convexity series; peaks mark positions where the convexity changes for
# many segment lengths. np.diff shortens the series by one, hence len(y) - 1
# rather than a hard-coded size.
result = np.zeros(len(y) - 1)
for i in range(10, 25):
    result += np.abs(np.diff(discrete_convexity_measure(y.values, i)))

plt.plot(result)
plt.show()
plt.plot(y)
plt.show()

In [None]:
# Length of the convexity series for segments of 12 steps
len(discrete_convexity_measure(y.values, 12))