# Tabular Playground Series - Jan 2022 with ETNA 🌋

In [None]:
!pip install etna==1.5.0 --ignore-installed -q 2> /dev/null

<a href="https://github.com/tinkoff-ai/etna">
    <img src="https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white"  align='left'>
</a>


In this notebook we will make predictions for [Tabular Playground Series - Jan 2022](https://www.kaggle.com/c/tabular-playground-series-jan-2022) with [etna time series library](https://github.com/tinkoff-ai/etna/).

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Locations of the competition CSVs and of the external yearly GDP table.
TRAIN_PATH = "../input/tabular-playground-series-jan-2022/train.csv"
TEST_PATH = "../input/tabular-playground-series-jan-2022/test.csv"
GDP_PATH = "../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv"
# Forecast horizon passed to every Pipeline; the dataset is built with freq="D",
# so this is a number of daily steps.
HORIZON = 365

# Dataset

Working with the ETNA library requires the usage of **TSDataset** - the special structure that holds many time series. Before creating the dataset, we need to prepare the raw data.

In [None]:
from etna.datasets import TSDataset

## Competition data

First of all, we need to:
1. Load dataset
2. Rename columns to fit the ETNA format:
    * `timestamp` - column with time variable 
    * `segment` - column identifying the individual time series within the dataset
    * `target` - column with target variable

In [None]:
def load_data():
    """Read the train and test CSVs and reshape them to ETNA's long format.

    For each file the ``row_id`` column is dropped, ``date`` is renamed to
    ``timestamp`` (and parsed as datetime), ``num_sold`` is renamed to
    ``target`` and a ``segment`` column is built as
    ``country + "_" + store + "_" + product``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` read from ``TRAIN_PATH`` and ``TEST_PATH``.
    """
    def _read_long_format(csv_path):
        frame = (
            pd.read_csv(csv_path)
            .drop(columns=["row_id"])
            .rename(columns={"date": "timestamp", "num_sold": "target"})
        )
        # One segment per (country, store, product) combination.
        frame["segment"] = frame["country"] + "_" + frame["store"] + "_" + frame["product"]
        frame["timestamp"] = pd.to_datetime(frame["timestamp"])
        return frame

    return _read_long_format(TRAIN_PATH), _read_long_format(TEST_PATH)

In [None]:
# Load both competition files once. `train` and `test` are notebook-level frames
# that prepare_target() and prepare_exog() read directly.
train, test = load_data()
train.head()

## Target

Let's use [dataset](https://www.kaggle.com/carlmcbrideellis/gdp-20152019-finland-norway-and-sweden) with yearly **Gross Domestic Product(GDP)** values to transform the target.

In [None]:
def transform_target_gdp(df, inverse=False):
    """Scale ``df["target"]`` by a per-row GDP factor, or undo that scaling.

    The factor of a row is the GDP of its ``country`` in the year of its
    ``timestamp`` (looked up in the CSV at ``GDP_PATH``, column
    ``"GDP_" + country``) raised to a fixed exponent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``timestamp``, ``country`` and ``target`` columns.
        The ``target`` column is overwritten in place.
    inverse : bool, default False
        ``False`` divides the target by the factor, ``True`` multiplies by it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a year or a ``GDP_<country>`` column is missing from the GDP table.
    """
    # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation
    gdp_exponent = 1.2121103201489674
    gdp_df = pd.read_csv(GDP_PATH, index_col='year')

    years = pd.to_datetime(df["timestamp"]).dt.year
    countries = df["country"]

    # Resolve every distinct (year, country) pair once instead of running a
    # Python-level lookup per row through DataFrame.apply(axis=1).
    factor_by_pair = {
        (year, country): gdp_df.loc[year, 'GDP_' + country] ** gdp_exponent
        for year, country in set(zip(years, countries))
    }
    # Explicit float dtype keeps an empty frame working (empty float Series).
    factor = pd.Series(
        [factor_by_pair[pair] for pair in zip(years, countries)],
        index=df.index,
        dtype="float64",
    )

    if inverse:
        df["target"] *= factor
    else:
        df["target"] /= factor
    return df

def prepare_target():
    """Return the GDP-scaled training target in ETNA's wide dataset format.

    Works on a copy of the notebook-level ``train`` frame, so ``train`` itself
    is left unchanged.
    """
    scaled = transform_target_gdp(train.copy())
    long_target = scaled[["timestamp", "segment", "target"]]
    return TSDataset.to_dataset(long_target)

In [None]:
# Build the target frame (output of TSDataset.to_dataset) and preview it.
df = prepare_target()
df.head()

## Feature engineering
Now, we need to prepare the exogenous data and do some simple feature engineering.

In [None]:
def engineer(df):
    """Return a new dataframe with the engineered calendar features.

    The input frame is copied first, so the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with four extra columns:

        * ``dayofyear`` - day of year on a leap-year calendar: in non-leap
          years every date from March onwards is shifted by one, so the same
          calendar date gets the same number in every year.
        * ``days_from_easter`` - days since Easter Sunday of the same year,
          clipped to [-3, 59]; values 12..38 are all collapsed to 12.
        * ``days_from_wed_jun`` - days since the last Wednesday of June,
          clipped to [-5, 5].
        * ``days_from_sun_nov`` - days since the first Sunday of November,
          clipped to [-1, 9].
    """
    import dateutil.easter as easter

    def last_wednesday_of_june(year):
        # Step back from June 30 to the nearest Wednesday (weekday() == 2).
        june_30 = pd.Timestamp(year=year, month=6, day=30)
        return june_30 - pd.Timedelta(days=(june_30.weekday() - 2) % 7)

    def first_sunday_of_november(year):
        # Step forward from November 1 to the nearest Sunday (weekday() == 6).
        november_1 = pd.Timestamp(year=year, month=11, day=1)
        return november_1 + pd.Timedelta(days=(6 - november_1.weekday()) % 7)

    df = df.copy()
    timestamp = df["timestamp"]
    year = timestamp.dt.year
    # Anchor dates depend only on the year, so compute them once per year.
    distinct_years = [int(value) for value in year.unique()]

    df["dayofyear"] = timestamp.dt.dayofyear
    # Any leap year keeps its numbering (not only 2016); other years are aligned to it.
    after_february_non_leap = ~timestamp.dt.is_leap_year & (timestamp.dt.month >= 3)
    df.loc[after_february_non_leap, 'dayofyear'] += 1

    # Easter
    easter_by_year = {y: pd.Timestamp(easter.easter(y)) for y in distinct_years}
    # pd.to_datetime keeps the dtype datetime even when the frame is empty.
    easter_timestamp = pd.to_datetime(year.map(easter_by_year))
    df['days_from_easter'] = (timestamp - easter_timestamp).dt.days.clip(-3, 59)
    df.loc[df['days_from_easter'].isin(range(12, 39)), 'days_from_easter'] = 12

    # Last Wednesday of June
    wed_june_by_year = {y: last_wednesday_of_june(y) for y in distinct_years}
    wed_june_timestamp = pd.to_datetime(year.map(wed_june_by_year))
    df['days_from_wed_jun'] = (timestamp - wed_june_timestamp).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_by_year = {y: first_sunday_of_november(y) for y in distinct_years}
    sun_nov_timestamp = pd.to_datetime(year.map(sun_nov_by_year))
    df['days_from_sun_nov'] = (timestamp - sun_nov_timestamp).dt.days.clip(-1, 9)

    return df

def prepare_exog():
    """Return the exogenous features for train and test in ETNA's wide format.

    The notebook-level ``train`` and ``test`` frames are concatenated, the
    ``target`` column is dropped, calendar features are added by ``engineer``,
    the three id columns are cast to ``category`` and every column except
    ``segment`` and ``timestamp`` ends up with the ``regressor_`` prefix.
    """
    features = engineer(pd.concat([train, test]).drop(columns=["target"]))

    category_dtypes = {name: "category" for name in ("country", "product", "store")}
    features = features.astype(category_dtypes)

    # Prefix everything, then restore the two structural column names.
    features = features.add_prefix("regressor_").rename(
        columns={"regressor_segment": "segment", "regressor_timestamp": "timestamp"}
    )
    return TSDataset.to_dataset(features)

It is important to remember two simple ideas:
1. All the columns in the exogenous data that are known for the future are **regressors** (they require the prefix `regressor_` to indicate this).
2. All categorical features should have the type `category` to be handled correctly.

In [None]:
# Build the exogenous feature frame (train and test rows together) and preview it.
df_exog = prepare_exog()
df_exog.head()

## TSDataset

Finally, we can create TSDataset!

In [None]:
def create_dataset():
    """Build a fresh daily TSDataset from the target and exogenous features.

    Both inputs come from ``prepare_target`` and ``prepare_exog``, which read
    the notebook-level ``train``/``test`` frames. The CSVs are therefore not
    re-read here: frames loaded inside this function would be locals that
    neither helper could see.

    Returns
    -------
    TSDataset
        Dataset with ``freq="D"`` and the exogenous frame attached.
    """
    df = prepare_target()
    df_exog = prepare_exog()
    return TSDataset(df=df, freq="D", df_exog=df_exog)

In [None]:
# Build the dataset used by the following cells and preview its first rows.
ts = create_dataset()
ts.head()

Let's take a look at the time series in the dataset

In [None]:
# Plot the dataset with the library's default plotting options.
ts.plot()

# Solution



## Model
We will use one CatBoost model for all the segments in the dataset. A separate model for each segment (`CatBoostModelPerSegment`) showed worse results.

In [None]:
from etna.models import CatBoostModelMultiSegment

# Baseline model with the library's default hyperparameters (no arguments passed).
model = CatBoostModelMultiSegment()

## Transforms

Transforms define the preprocessing and feature engineering steps. They are applied one by one. Don't forget about the prefix for the regressors!

In [None]:
from etna.transforms import LogTransform, StandardScalerTransform # math
from etna.transforms import DateFlagsTransform, HolidayTransform # datetime
from etna.transforms import LagTransform # lags

# Preprocessing / feature steps shared by every pipeline in this notebook.
# Every generated column gets a "regressor_"-prefixed out_column name.
transforms = [LogTransform(in_column="target"),
              StandardScalerTransform(in_column="target", mode="per-segment"),
              # Calendar flags: weekend indicator plus a flag for day index 4
              # (presumably Friday with Monday=0 - confirm in the etna docs).
              DateFlagsTransform(day_number_in_week=False, day_number_in_month=False,
                                 is_weekend=True, special_days_in_week=[4],
                                 out_column="regressor_date_flag"),
              # Holiday features for each of the three countries.
              HolidayTransform(iso_code="SWE", out_column="regressor_SWE_holidays"),
              HolidayTransform(iso_code="NOR", out_column="regressor_NOR_holidays"),
              HolidayTransform(iso_code="FIN", out_column="regressor_FIN_holidays"),
              # Lagged holiday features: range(5,6) is only lag 5, range(2,6) is lags 2..5.
              LagTransform(in_column="regressor_SWE_holidays", lags=list(range(5,6)), out_column="regressor_SWE_holidays_lag"),
              LagTransform(in_column="regressor_NOR_holidays", lags=list(range(2,6)), out_column="regressor_NOR_holidays_lag"),
              LagTransform(in_column="regressor_FIN_holidays", lags=list(range(2,6)), out_column="regressor_FIN_holidays_lag"),]

## Pipeline
This is the main framework to evaluate the model and get forecasts.

In [None]:
from etna.pipeline import Pipeline

# Single-model pipeline: baseline model + shared transforms, forecasting HORIZON steps ahead.
pipeline = Pipeline(model=model, transforms=transforms, horizon=HORIZON)

### Evaluation(backtest)

To evaluate the resulting pipeline, we are going to run a **backtest** - a special kind of cross-validation for time series that takes the ordering by timestamp into account. The basic idea is simple: don't validate on the past.

In [None]:
from etna.metrics import SMAPE

# 3-fold backtest of the single pipeline scored with SMAPE; the third returned value is discarded.
metrics, forecasts, _ = pipeline.backtest(ts, metrics=[SMAPE()], aggregate_metrics=True, n_folds=3)

Let's plot the backtest results and look at the metric

In [None]:
from etna.analysis import plot_backtest

In [None]:
# Average only the SMAPE column. Taking the mean of the whole frame first relies on
# pandas silently dropping non-numeric columns, which newer pandas versions refuse to do.
print("SMAPE(mean): ",metrics["SMAPE"].mean())

In [None]:
# Plot backtest forecasts against the dataset; history_len is set to HORIZON
# (presumably the number of history points drawn - confirm in the etna docs).
plot_backtest(forecasts, ts, history_len=HORIZON)

### Feature importance

It is also easy to get the feature importance values from the underlying model. Let's take a look

In [None]:
def plot_feature_importance(importance, names, top_k=None):
    """Plot features ranked by importance as a seaborn bar chart.

    Parameters
    ----------
    importance : array-like
        Importance value of each feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    top_k : int, optional
        Number of highest-ranked features to draw; all of them when ``None``.
    """
    limit = len(names) if top_k is None else top_k

    ranked = pd.DataFrame(
        {'feature_names': np.array(names), 'feature_importance': np.array(importance)}
    ).sort_values(by=['feature_importance'], ascending=False)
    # Positional slice: the first `limit` rows after sorting.
    shown = ranked.iloc[:limit]

    plt.figure(figsize=(10, 8))
    sns.barplot(x=shown['feature_importance'], y=shown['feature_names'])
    plt.title('FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Fit on a dataset bound to its own name so the notebook-level `ts`, which the
# ensemble backtest reuses later, is never the object handed to fit().
# NOTE(review): Pipeline.fit is expected to apply the transforms to the dataset
# it receives - confirm for etna 1.5.0.
fit_ts = create_dataset()
pipeline.fit(fit_ts)
# Keep the raw CatBoost estimator under a separate name: rebinding `model` would
# replace the ETNA model object that the pipeline-definition cell uses if that cell is re-run.
catboost_model = pipeline.model._base_model.model
plot_feature_importance(catboost_model.get_feature_importance(), catboost_model.feature_names_)

# Forecasting

Now, let's build the final solution. We will use an ensemble of CatBoost models with different random seeds to make the forecast more robust.

## Ensemble

In [None]:
from etna.ensembles import VotingEnsemble

Hyperparameters from optuna:

In [None]:
# CatBoost hyperparameter sets, one per ensemble member (params[i] is later
# paired with seeds[i]). Only four distinct trials exist; some are used by
# more than one member.
_TRIAL_FIELDS = (
    'depth',
    'iterations',
    'l2_leaf_reg',
    'random_strength',
    'one_hot_max_size',
    'bagging_temperature',
)
_trial_a = (5, 1147, 0.779126455221549, 8.637529894850367, 5, 0.8127462916337822)
_trial_b = (6, 1174, 0.0443925684503373, 8.864563894739499, 5, 0.7263194054569034)
_trial_c = (5, 1170, 0.15106766436681104, 8.857637446656524, 5, 0.53404545695121)
_trial_d = (5, 1193, 0.06992597880991089, 8.889861067995088, 5, 0.6960270472983301)

# Each member gets its own dict object, in the original member order.
params = [
    dict(zip(_TRIAL_FIELDS, trial))
    for trial in (_trial_a, _trial_a, _trial_b, _trial_b, _trial_c, _trial_d, _trial_c)
]

Random seeds from my head:

In [None]:
# One random seed per ensemble member, matched by position with `params`;
# the first member passes random_seed=None to the model.
seeds = [None, 13, 121, 11041999, 3141, 235813, 1501]

And the ensemble is ready!

In [None]:
# Seeds and hyperparameter sets are paired by position, so a length mismatch
# must fail loudly instead of silently leaving some entries unused.
assert len(seeds) == len(params), "each seed needs exactly one hyperparameter set"

# One pipeline per (seed, hyperparameter set); all of them use the same
# transforms list and the same forecast horizon.
pipelines = [
    Pipeline(
        model=CatBoostModelMultiSegment(random_seed=seed, **model_params),
        transforms=transforms,
        horizon=HORIZON,
    )
    for seed, model_params in zip(seeds, params)
]
ensemble = VotingEnsemble(pipelines=pipelines, n_jobs=5)

In [None]:
# Same 3-fold SMAPE backtest as for the single pipeline, now on the ensemble;
# this rebinds `metrics` and `forecasts`.
metrics, forecasts, _ = ensemble.backtest(ts, metrics=[SMAPE()], aggregate_metrics=True, n_folds=3)

Slightly better :)

In [None]:
# Average only the SMAPE column. Taking the mean of the whole frame first relies on
# pandas silently dropping non-numeric columns, which newer pandas versions refuse to do.
print("SMAPE(mean): ",metrics["SMAPE"].mean())

## Forecasts

In [None]:
from etna.analysis import plot_forecast

In [None]:
# Rebuild the dataset, fit the ensemble on it and forecast.
# forecast() takes no arguments here; each member pipeline was built with horizon=HORIZON.
ts = create_dataset()
ensemble.fit(ts)
future = ensemble.forecast()

Let's take a look at the forecast to check that everything is fine

In [None]:
# Draw the ensemble forecast together with the dataset it was fitted on.
plot_forecast(forecast_ts=future, train_ts=ts)

# Submission

In [None]:
def prepare_submission():
    """Turn the notebook-level ``future`` forecast into the submission table.

    The forecast is flattened to long format, ``country``/``store``/``product``
    are recovered from the ``segment`` name (split on ``"_"``), the GDP scaling
    of the target is inverted, and the result is joined to the test file to
    pick up ``row_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` and ``num_sold``, sorted by ``row_id``.
    """
    test_rows = pd.read_csv(TEST_PATH)
    test_rows["date"] = pd.to_datetime(test_rows["date"])
    test_rows = test_rows.rename(columns={"date": "timestamp"})

    forecast = TSDataset.to_flatten(future.df)
    # Segment names were built as country_store_product; split once and fan out.
    segment_parts = forecast["segment"].str.split("_")
    for position, column in enumerate(("country", "store", "product")):
        forecast[column] = segment_parts.apply(lambda parts, position=position: parts[position])

    forecast = transform_target_gdp(forecast, inverse=True)
    merged = pd.merge(forecast, test_rows, on=["timestamp", "country", "store", "product"])
    merged = merged.rename(columns={"target": "num_sold"}).sort_values(by=["row_id"])
    return merged[["row_id", "num_sold"]]

In [None]:
# Build the submission table from the ensemble forecast stored in `future`.
submission = prepare_submission()

In [None]:
# Preview the first rows before writing the file.
submission.head()

In [None]:
# Write the submission file without the dataframe index column.
submission.to_csv("submission.csv", index=False)