This notebook stacks (blends) the submissions of two other well-known notebooks:
1. TimeSeries Forecasting with Darts Global Models
https://www.kaggle.com/code/ferdinandberr/timeseries-forecasting-with-darts-global-models
2. Guide: External Data&Features for MultivariateTS
https://www.kaggle.com/code/romaupgini/guide-external-data-features-for-multivariatets


The 10 attached submissions are all variations that improve on the original Upgini notebook (2). <br/>
The best-scoring Upgini solution is `submission_39261.csv`; the Darts solution, `submission_38558.csv`, has the best score overall.<br/>
In this notebook I tried to combine the Upgini and Darts submissions and got an even better score.

In [None]:
import numpy as np
import pandas as pd
from random import random
import os

from sklearn.metrics import mean_squared_log_error as msle
from dateutil.relativedelta import relativedelta
import statsmodels.api as sm
# visualization tools
from matplotlib import pyplot as plt, style
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever name this installation provides.
style.use('seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in style.available else 'seaborn-darkgrid')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
from tqdm import tqdm

import gc
# Make sure automatic garbage collection is switched on.
gc.enable()
from warnings import filterwarnings, simplefilter
# Silence every warning for the rest of the session (both calls install an 'ignore' filter).
filterwarnings('ignore')
simplefilter('ignore')

### Load the Upgini and Darts submissions

In [None]:
# Read the competition test set, indexed by `id`, with `date` parsed as a datetime.
test = pd.read_csv(
    '../input/store-sales-time-series-forecasting/test.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
    index_col=['id'],
)
# Turn the parsed timestamps into daily periods.
test['date'] = test['date'].dt.to_period('D')

# Put the `sales` column of every submission file next to the test features;
# rows are aligned on the shared `id` index.
df_pred = test.copy()
for dirname, _, filenames in os.walk('/kaggle/input/store-sales-submissions'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)
        # The text between the first '_' and the following '.' of the file name
        # becomes the column suffix, e.g. 'submission_39261.csv' -> 'sub_39261'.
        tag = filename.split('_')[1].split('.')[0]
        df_pred['sub_' + tag] = pd.read_csv(path, index_col=['id']).sales

df_pred.head()

In [None]:
# all submissions are highly correlated

# Average every submission per product family. Only the `sub_*` prediction columns
# are selected explicitly: pandas 2.0 no longer silently drops non-numeric columns
# (here the period-typed `date`) in mean()/corr(), so relying on that would fail.
sub_columns = [col for col in df_pred.columns if col.startswith('sub_')]
res = df_pred.groupby('family')[sub_columns].mean()
res_corr = res.corr(method="spearman")
# Hide the diagonal and the upper triangle by position. Building the mask from the
# correlation values themselves would leave any exact-zero correlation unmasked.
upper_mask = np.triu(np.ones_like(res_corr, dtype=bool))
sns.heatmap(res_corr, annot=True, fmt='.1f', cmap='coolwarm', square=True, mask=upper_mask, linewidths=1, cbar=False)

In [None]:
# Upgini only notebooks
pred_cols = [
    'sub_39331',
    'sub_39359',
    'sub_39274',
    'sub_39369',
    'sub_39347',
    'sub_39263',
    'sub_39281',
    'sub_39261',
    'sub_39262',
    'sub_39266',
]

def func(dt):
    """Measure how far each `pred_cols` submission is from `sub_38558`.

    `dt` is a frame that contains the `pred_cols` columns and `sub_38558`.
    Returns a Series indexed by the `pred_cols` names; each value is the
    fourth root (square root applied twice) of the mean squared log error
    between that column and `sub_38558`.
    """
    reference = dt['sub_38558']
    return pd.Series({
        col: np.sqrt(np.sqrt(msle(dt[col], reference)))
        for col in pred_cols
    })


# "min" - is the column name with the predicted values closest to the best Darts "38558" submission
# One row per family, one distance column per `pred_cols` submission, plus a
# "min" column holding the label of the smallest distance in that row.
res = (
    df_pred
    .groupby('family')
    .apply(func)
    .assign(min=lambda scores: scores.idxmin(axis=1))
)
res

As we can see, the best solution (`38558`) is closest not to the second solution (`39261`) but to two other solutions, which have some of the lowest scores in the dataset.

In [None]:
# Count in how many families each submission is the closest one.
res.loc[:, 'min'].value_counts()

In [None]:
# Per-date mean of four selected submissions, drawn as one line each.
plot_cols = ['sub_38415', 'sub_39331', 'sub_39359', 'sub_38558']
res = df_pred.groupby('date')[plot_cols].mean()
res.plot(figsize=(20, 10))

In [None]:
# stack the closest 3 submissions ['sub_39331','sub_39359','sub_38558'] - has the best score!

blend_cols = ['sub_39331', 'sub_39359', 'sub_38558']
# Row-wise average of the chosen submissions; the `id` index becomes a regular column.
sub = df_pred[blend_cols].mean(axis=1).to_frame('sales').reset_index()
sub.to_csv('submission_38558.csv', index=False)

In [None]:
# stack the closest 3 submissions ['sub_38415','sub_39331','sub_39359'] - the previous best score with 2 other closest solutions

blend_cols = ['sub_38415', 'sub_39331', 'sub_39359']
# Row-wise average of the chosen submissions; the `id` index becomes a regular column.
sub = df_pred[blend_cols].mean(axis=1).to_frame('sales').reset_index()
sub.to_csv('submission_38415.csv', index=False)

In [None]:
# stack the 2 best submissions

blend_cols = ['sub_38415', 'sub_38558']
# Row-wise average of the two submissions; the `id` index becomes a regular column.
sub = df_pred[blend_cols].mean(axis=1).to_frame('sales').reset_index()
sub.to_csv('submission.csv', index=False)

In [None]:
# stack all Upgini submissions together with the Darts one - not very well

# Upgini and Darts notebooks
sub_cols = [*pred_cols, 'sub_38558']

# Row-wise average over every listed submission; the `id` index becomes a regular column.
sub = df_pred[sub_cols].mean(axis=1).to_frame('sales').reset_index()
sub.to_csv('submission_mean.csv', index=False)

In [None]:
# stack only the Upgini submissions (`pred_cols`) - also not very well

# Row-wise average over the `pred_cols` submissions; the `id` index becomes a regular column.
sub = df_pred[pred_cols].mean(axis=1).to_frame('sales').reset_index()
sub.to_csv('submission_pred.csv', index=False)