# Introduction

Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! 😊

In [None]:
import pandas as pd
import numpy as np

# Load the competition train/test CSVs, using the `row_id` column as the DataFrame index.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv', index_col = 'row_id')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv', index_col = 'row_id')

In [None]:
# Show the distinct values of each categorical column of the training data.
for column_name in ('country', 'store', 'product'):
    display(train[column_name].unique())

# Data Preparation

In [None]:
def get_ts_dict(df, country_list=None, store_list=None, product_list=None):
    """Split a long-format sales frame into one sub-frame per country/store/product.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``country``, ``store``, ``product``, ``date``
        and ``num_sold``.
    country_list, store_list, product_list : sequence of str, optional
        Values to enumerate for each dimension. When omitted, the fixed lists
        previously hard-coded in this function are used, so existing calls
        behave exactly as before.

    Returns
    -------
    dict
        Maps ``"<country>_<store>_<product>"`` to the rows of ``df`` matching
        that combination, restricted to the ``date`` and ``num_sold`` columns.
        Keys are inserted with country as the outermost and product as the
        innermost loop. A combination without matching rows maps to an empty
        frame.
    """
    from itertools import product as cartesian_product

    if country_list is None:
        country_list = ['Finland', 'Norway', 'Sweden']
    if store_list is None:
        store_list = ['KaggleMart', 'KaggleRama']
    if product_list is None:
        product_list = ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']

    time_series_dict = {}
    for country, store, product in cartesian_product(country_list, store_list, product_list):
        row_mask = (
            (df['country'] == country)
            & (df['store'] == store)
            & (df['product'] == product)
        )
        time_series_dict[f'{country}_{store}_{product}'] = df.loc[row_mask, ['date', 'num_sold']]
    return time_series_dict

In [None]:
# Parse the date strings of both splits into datetimes.
for split_frame in (train, test):
    split_frame['date'] = pd.to_datetime(split_frame['date'])

# One sub-frame per country/store/product combination, plus the keys in insertion order.
ts_dict = get_ts_dict(train)
key_list = list(ts_dict)

In [None]:
def ts_plot(ts_dict, key_list, figsize=(24, 24)):
    """Draw one ``num_sold`` line plot per key on a two-column grid of subplots.

    Relies on the module-level ``plt`` (matplotlib.pyplot), ``sns`` (seaborn)
    and ``pd`` (pandas) names being imported before the function is called.

    Parameters
    ----------
    ts_dict : dict
        Maps a key to a DataFrame with ``date`` (datetime) and ``num_sold`` columns.
    key_list : sequence
        Keys of ``ts_dict`` to plot, in panel order (left to right, top to bottom).
    figsize : tuple, optional
        Size of the whole figure, forwarded to ``plt.subplots``.

    Notes
    -----
    Grid cells without a series (surplus cell of an odd-sized grid, a key that
    is missing from ``ts_dict``, or an empty frame) are hidden rather than
    raising. Each panel gets a dashed red vertical line on 30 December of every
    year present in its data.
    """
    import datetime as dt
    import math

    ncols = 2
    # Round up: `round` uses banker's rounding (round(2.5) == 2), which left an
    # odd number of keys one panel short.
    nrows = max(1, math.ceil(len(key_list) / ncols))
    # squeeze=False keeps `axes` two-dimensional even when there is only one row.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    plt.subplots_adjust(hspace=1.5)

    # axes.flat walks the grid row by row, matching the order of key_list.
    for index, ax in enumerate(axes.flat):
        key = key_list[index] if index < len(key_list) else None
        df = ts_dict.get(key) if key is not None else None
        if df is None or df.empty:
            ax.set_visible(False)
            continue

        sns.lineplot(data=df, x='date', y='num_sold', ax=ax)
        ax.set_title(key)
        x_label = pd.date_range(start=min(df['date']), end=max(df['date']), freq='3M').astype(str)
        ax.set_xticks(x_label)
        ax.set_xticklabels(x_label, rotation=90)
        # One marker per distinct year; iterating the raw column redrew the
        # same line once for every row of data.
        for year in df['date'].dt.year.unique():
            ax.axvline(x=dt.datetime(int(year), 12, 30), ymin=0, ymax=1,
                       color='red', linestyle='--')
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot every country/store/product series on one grid of subplots.
ts_plot(ts_dict, key_list)

# Time Series Analysis

In [None]:
def plot_periodogram(ts, detrend='linear', ax=None, title=''):
    """Plot the periodogram of a daily series with the frequency axis in cycles per year.

    Parameters
    ----------
    ts : array-like
        Observations, treated as one sample per day.
    detrend : str, optional
        Forwarded to ``scipy.signal.periodogram`` (default ``'linear'``).
    ax : matplotlib axes, optional
        Axes to draw on. When omitted a new figure is created through the
        module-level ``plt``.
    title : str, optional
        Text appended to ``"Periodogram - "`` in the axes title.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    # Samples per year for daily data. The original expression
    # pd.Timedelta("1Y") / pd.Timedelta("1D") evaluated to 365.2425, but the
    # ambiguous "Y" unit is rejected by recent pandas releases, so the value is
    # spelled out directly.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are cycles per year, labelled with the matching period name.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=90,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram - " + title)
    return ax

In [None]:
# Independent copies of the three Finland / KaggleMart series, one per product.
df_finland_mart_mug, df_finland_mart_hat, df_finland_mart_sticker = (
    ts_dict[series_key].copy()
    for series_key in (
        'Finland_KaggleMart_Kaggle Mug',
        'Finland_KaggleMart_Kaggle Hat',
        'Finland_KaggleMart_Kaggle Sticker',
    )
)

In [None]:
import warnings

# Install a process-wide filter that silences every warning from this point on.
warnings.filterwarnings("ignore")

# One periodogram per Finland / KaggleMart product, stacked vertically.
fig, axes = plt.subplots(3, 1, figsize=(16, 8))
plt.subplots_adjust(hspace=1.5)
periodogram_panels = (
    (df_finland_mart_mug, 'finland_mart_mug'),
    (df_finland_mart_hat, 'finland_mart_hat'),
    (df_finland_mart_sticker, 'finland_mart_sticker'),
)
for panel_ax, (product_frame, panel_title) in zip(axes, periodogram_panels):
    plot_periodogram(product_frame['num_sold'], ax=panel_ax, title=panel_title)
plt.show()

## Mug

In [None]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Number of sine/cosine pairs used for the annual Fourier term.
fourier_order = 1
fourier = CalendarFourier(freq="A", order=fourier_order)

# Deterministic regressors indexed by the mug series' dates.
dp = DeterministicProcess(
    index=df_finland_mart_mug['date'],
    constant=False,               # no constant (bias / y-intercept) column is generated
    order=1,                     # trend (order 1 means linear)
    seasonal=True,               # seasonal indicator columns; presumably weekly for this daily index — TODO confirm the inferred period
    additional_terms=[fourier],  # annual seasonality (fourier)
    drop=True                   # drop terms to avoid collinearity
)

# Feature matrix over the in-sample dates; the bare name displays it as the cell output.
train_ts_mug = dp.in_sample()
train_ts_mug

In [None]:
# Overlay a scaled annual Fourier wave on the observed mug sales.
fig, ax = plt.subplots(figsize=(16,8))
sns.lineplot(x=df_finland_mart_mug['date'], y=df_finland_mart_mug['num_sold'], label='mug', ax=ax)

# Sum the sine and cosine columns of every Fourier order into one wave.
train_ts_mug['total_wave'] = 0
for i in range(fourier_order):
    train_ts_mug['total_wave'] = (train_ts_mug['total_wave'] + 
                                  train_ts_mug[f'sin({i+1},freq=A-DEC)'] + train_ts_mug[f'cos({i+1},freq=A-DEC)'])
# Scale by 20 and offset by 200 so the wave is drawn in the range of the sales curve
# (NOTE(review): both constants look hand-picked for this plot — confirm).
train_ts_mug['total_wave'] *= 20
train_ts_mug['total_wave'] +=200

# Negative shift: each value moves `shift_num` rows earlier.
# `shift_num` is a module-level name that later cells and feature_eng also read.
shift_num = 38
train_ts_mug['total_wave'] = train_ts_mug['total_wave'].shift(-shift_num)
sns.lineplot(x=train_ts_mug.index, y=train_ts_mug['total_wave'], label='ts', ax=ax, linewidth=3)

plt.show()

## Hat

In [None]:
# Number of sine/cosine pairs used for the annual Fourier term.
fourier_order = 1
fourier = CalendarFourier(freq="A", order=fourier_order)

# Deterministic regressors indexed by the hat series' dates.
dp = DeterministicProcess(
    index=df_finland_mart_hat['date'],
    constant=False,               # no constant (bias / y-intercept) column is generated
    order=1,                     # trend (order 1 means linear)
    seasonal=True,               # seasonal indicator columns; presumably weekly for this daily index — TODO confirm the inferred period
    additional_terms=[fourier],  # annual seasonality (fourier)
    drop=True                   # drop terms to avoid collinearity
)

# Feature matrix over the in-sample dates.
train_ts_hat = dp.in_sample()

In [None]:
# Overlay a scaled annual Fourier wave on the observed hat sales.
fig, ax = plt.subplots(figsize=(16,8))
sns.lineplot(x=df_finland_mart_hat['date'], y=df_finland_mart_hat['num_sold'], label='hat', ax=ax)

# Sum the sine and cosine columns of every Fourier order into one wave.
train_ts_hat['total_wave'] = 0
for i in range(fourier_order):
    train_ts_hat['total_wave'] = (train_ts_hat['total_wave'] + 
                                  train_ts_hat[f'sin({i+1},freq=A-DEC)'] + train_ts_hat[f'cos({i+1},freq=A-DEC)'])
# Scale by 50 and offset by 350 so the wave is drawn in the range of the sales curve
# (NOTE(review): both constants look hand-picked for this plot — confirm).
train_ts_hat['total_wave'] *= 50
train_ts_hat['total_wave'] +=350

# Positive shift: each value moves `shift_num` rows later.
# `shift_num` is not defined in this cell; it must already exist at module level.
train_ts_hat['total_wave'] = train_ts_hat['total_wave'].shift(shift_num)
sns.lineplot(x=train_ts_hat.index, y=train_ts_hat['total_wave'], label='ts', ax=ax, linewidth=3)

plt.show()

# Feature Engineering

In [None]:
def feature_eng(df):
    """Add calendar, peak, holiday, seasonality and GDP features to a sales frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``date`` column (anything ``pd.to_datetime`` accepts) and a
        ``country`` column. The calendar, peak and holiday columns are written
        onto this object before the first merge, so the caller's frame is
        modified as well.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns added and the helper columns
        ``date`` and ``MM-DD`` removed. It is the result of column merges, so
        the index of the input is not carried over.

    Notes
    -----
    * Reads the module-level names ``pd``, ``CalendarFourier``,
      ``DeterministicProcess`` and ``shift_num``; all must be defined before
      this function is called.
    * Reads the GDP table from a fixed CSV path under ``../input``.
    * Needs the third-party ``holidays`` package.
    """
    import holidays
    import datetime

    #### Date
    df['date'] = pd.to_datetime(df['date'])
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar() yields the same ISO week number. Cast to int64, the dtype
    # the old accessor produced, so dtype-based feature selection is unchanged.
    df['week'] = df['date'].dt.isocalendar().week.astype('int64')
    df['year'] = 'Y' + df['date'].dt.year.astype(str)
    df['quarter'] = 'Q' + df['date'].dt.quarter.astype(str)
    df['day'] = df['date'].dt.day
    df['dayofyear'] = df['date'].dt.dayofyear
    # In leap years, pull every day from 29 Feb onwards back by one so the
    # same calendar date gets the same number in every year.
    df.loc[(df.date.dt.is_leap_year) & (df.dayofyear >= 60), 'dayofyear'] -= 1
    df['weekend'] = df['date'].dt.weekday >= 5
    df['weekday'] = 'WD' + df['date'].dt.weekday.astype(str)
    df['month'] = 'M' + df['date'].dt.month.astype(str)

    #### Peak
    # Ramp from 0.25 up to 1 and back down around the turn of the year;
    # every other day is 0. Built as a float column from the start instead of
    # assigning floats into an integer column.
    df['MM-DD'] = df['date'].dt.strftime('%m-%d')
    peak_by_day = {
        '12-26': 0.25, '12-27': 0.50, '12-28': 0.75,
        '12-29': 1.0, '12-30': 1.0, '12-31': 1.0,
        '01-01': 0.75, '01-02': 0.50, '01-03': 0.25,
    }
    df['peak'] = df['MM-DD'].map(peak_by_day).fillna(0.0)

    #### Till The Next Holiday
    def get_country_holidays(country, years_list):
        """Return the country's holidays plus fixed year-end dates, sorted by date."""
        festivities = holidays.CountryHoliday(country, years=years_list)
        festivities_df = pd.DataFrame.from_dict(festivities, orient='index').reset_index().rename(columns={'index':'date', 0:'festivity_name'})
        festivities_df['date'] = pd.to_datetime(festivities_df['date'])
        if country == 'Sweden':
            # Drop the entries named 'Söndag' (Sunday) from the Swedish calendar.
            festivities_df = festivities_df[festivities_df['festivity_name']!='Söndag']

        additional_dates = [[pd.to_datetime(f'{year}-12-24'), 'Christmas Eve'] for year in years_list]
        additional_dates += [[pd.to_datetime(f'{year}-12-29'), 'Peak in sales 1/2'] for year in years_list]
        additional_dates += [[pd.to_datetime(f'{year}-12-30'), 'Peak in sales 2/2'] for year in years_list]
        additional_dates += [[pd.to_datetime(f'{year}-12-31'), 'Saint Sylvester'] for year in years_list]
        additional_dates += [[pd.to_datetime(f'{year}-01-01'), 'New Year'] for year in years_list]
        additional_festivities_df = pd.DataFrame(additional_dates, columns=['date', 'festivity_name'])

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        festivities_df = pd.concat([festivities_df, additional_festivities_df], ignore_index=True)
        return festivities_df.sort_values('date')

    # The calendar only depends on (country, year); caching it avoids
    # rebuilding the holiday table for every single row of `df`.
    holiday_dates_cache = {}

    def days_till_next_holiday(country, date):
        """Days from `date` to the next holiday on or after it (0 on a holiday)."""
        cache_key = (country, date.year)
        if cache_key not in holiday_dates_cache:
            holiday_dates_cache[cache_key] = get_country_holidays(
                country, [date.year, date.year + 1])['date']
        country_holidays_dates = holiday_dates_cache[cache_key]
        next_date = country_holidays_dates[country_holidays_dates >= date].min()
        return (next_date - date).days

    df['days_till_next_holiday'] = df.apply(lambda x: days_till_next_holiday(x['country'], x['date']), axis=1)

    #### Seasonality
    # Pad the range by 60 days on both sides so the shifted wave columns below
    # have values on every date that occurs in `df`.
    padding = datetime.timedelta(days=60)
    date_range = pd.date_range(start=df['date'].min() - padding,
                               end=df['date'].max() + padding, freq='D')
    fourier = CalendarFourier(freq="A", order=1)
    dp = DeterministicProcess(
            index=date_range,
            constant=False,               # no constant (bias / y-intercept) column
            order=1,                     # trend (order 1 means linear)
            seasonal=True,               # seasonal indicator columns
            additional_terms=[fourier],  # annual seasonality (fourier)
            drop=True                   # drop terms to avoid collinearity
    )
    ts_features = dp.in_sample()
    ts_features['wave'] = ts_features['sin(1,freq=A-DEC)'] + ts_features['cos(1,freq=A-DEC)']
    # Mug columns use the wave shifted by -shift_num, hat columns by +shift_num;
    # each also gets three further copies shifted by one to three extra rows.
    for product_name, base_shift in (('mug', -shift_num), ('hat', shift_num)):
        ts_features[f'wave_{product_name}'] = ts_features['wave'].shift(base_shift)
        for lag in (1, 2, 3):
            ts_features[f'wave_{product_name}_lag{lag}'] = ts_features['wave'].shift(base_shift + lag)
    ts_features = ts_features.drop(columns=['sin(1,freq=A-DEC)', 'cos(1,freq=A-DEC)'])
    df = df.merge(ts_features, left_on='date', right_index=True)

    #### GDP
    gdp_df = pd.read_csv('../input/gdp-data-2014-to-2019-finland-norway-sweden/GDP_data_2014_to_2019_Finland_Norway_Sweden.csv', sep=';')
    gdp_df.columns = ['year', 'Finland', 'Norway', 'Sweden']
    gdp_countries = ['Finland', 'Norway', 'Sweden']

    def melt_by_country(wide_df, value_name):
        """Reshape the year x country table to long form keyed like df's 'year' column."""
        long_df = pd.melt(wide_df, id_vars=['year'], value_vars=gdp_countries,
                          var_name='country', value_name=value_name)
        long_df['year'] = 'Y' + long_df['year'].astype(str)
        return long_df

    df = df.merge(melt_by_country(gdp_df, 'gdp'), how='left', on=['year', 'country'])

    #### GDP Percentage Change Between Years
    # Row-over-row change; the first row of the table has no predecessor and is NaN.
    gdp_df[gdp_countries] = gdp_df[gdp_countries].pct_change()
    df = df.merge(melt_by_country(gdp_df, 'gdp_pct'), how='left', on=['year', 'country'])

    return df.drop(columns=['date', 'MM-DD'])

In [None]:
# Engineer features on train and test together so both splits get the same columns.
n_train = len(train)
train_test = feature_eng(pd.concat([train, test]))

# The split below is positional, so feature_eng must return exactly one row
# per input row; a merge that dropped or duplicated rows would otherwise
# silently move rows between the splits.
assert len(train_test) == n_train + len(test), "feature_eng changed the number of rows"

train_new = train_test.iloc[:n_train].copy()
# Build test_new as a new frame; dropping in place on an iloc slice of
# train_test triggers pandas' SettingWithCopyWarning.
test_new = train_test.iloc[n_train:].drop(columns=['num_sold'])

In [None]:
train_new

# Modeling

In [None]:
%%capture
!pip install pycaret[full]

In [None]:
# `np.int` and `np.float` were plain aliases of the builtins `int` and `float`
# and were removed in NumPy 1.24; comparing against the builtins selects the
# same columns on every NumPy version.
NUM_FEATURES = list(test_new.loc[:, test_new.dtypes == int].columns)
NUM_FEATURES_2 = list(test_new.loc[:, test_new.dtypes == float].columns)
NUM_FEATURES.extend(NUM_FEATURES_2)

# Every remaining column is treated as categorical.
FEATURES = list(test_new.columns)
CAT_FEATURES = [feature for feature in FEATURES if feature not in NUM_FEATURES]

print(CAT_FEATURES)
print(NUM_FEATURES)

In [None]:
from pycaret.regression import *

# Configure the pycaret regression experiment on the engineered training frame.
# Categorical and numeric columns are passed explicitly from the lists built above.
reg = setup(data = train_new,
            target = 'num_sold',
            normalize = True, #normalisation helps some algorithms
            normalize_method = 'robust', #resilient to outliers
            transform_target = True, #applies transformation to target column
            transform_target_method = 'box-cox',
            data_split_shuffle = False, #so that we do not use "future" observations to predict "past" observations
            create_clusters = True,
            feature_interaction = True,
            categorical_features = CAT_FEATURES,
            numeric_features = NUM_FEATURES,
            session_id = 42, # fixed seed for reproducible runs
            use_gpu = False,
            silent = True, # presumably skips the interactive dtype confirmation — TODO confirm against the installed pycaret version
            fold = 10,
            n_jobs = -1)

In [None]:
# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    elements where both values are zero contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Observed and predicted values of the same length. Lists, NumPy arrays
        and pandas Series are accepted; values are compared by position.

    Returns
    -------
    float
        Mean of the per-element errors.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    # Use |y_true| as well: with the raw value a negative observation could
    # shrink or flip the sign of the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.zeros_like(denominator)
    # Divide only where the denominator is non-zero; the remaining entries keep
    # their 0.0, without emitting divide-by-zero warnings or NaNs.
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

# Register SMAPE as a custom metric under the id and display name 'SMAPE'; lower scores are better.
add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better=False)

In [None]:
# Number of best-ranked models to keep for the tuning and blending steps.
N = 3
# Compare the candidate models ranked by the custom SMAPE metric and keep the N selected ones.
top = compare_models(sort = 'SMAPE', n_select = N)

In [None]:
# Tune each selected model for SMAPE with 100 search iterations; choose_better=True
# presumably keeps the untuned model when tuning does not improve it — TODO confirm.
tuned_top = [tune_model(i, optimize = 'SMAPE', choose_better=True, n_iter=100) for i in top]

In [None]:
# Combine the tuned models into one blended estimator, optimised for SMAPE.
blend = blend_models(tuned_top, optimize='SMAPE')
# Score the blend; no `data` is passed, so presumably pycaret's hold-out split is used — TODO confirm.
# The trailing semicolon suppresses the cell's return-value output.
predict_model(blend);

In [None]:
# Finalize the blend; presumably this refits it on all training data including
# the hold-out part — TODO confirm. If so, the scores printed below are no
# longer out-of-sample.
final_blend = finalize_model(blend)
predict_model(final_blend);

In [None]:
# Draw pycaret's 'error' diagnostic plot for the finalized blend.
plot_model(final_blend, plot='error')

In [None]:
import gc
# Free unreferenced objects before the prediction step.
gc.collect()
# Predict on the engineered test features with the finalized blend.
unseen_predictions_blend = predict_model(final_blend, data=test_new)
unseen_predictions_blend.head()

In [None]:
gc.collect()

# Sanity check: exactly one prediction per test row.
assert len(unseen_predictions_blend) == len(test.index)

# Pair every test row_id with the prediction at the same position.
sub = pd.DataFrame({
    'row_id': list(test.index),
    'num_sold': list(unseen_predictions_blend.Label),
})

sub.to_csv('submission.csv', index = False)

print(sub)