## Basic RNN2RNN Model

- test data : [Kaggle m5-forecasting-accuracy competition first **1000** item sale series](https://www.kaggle.com/c/m5-forecasting-accuracy/overview)
- metric and loss function: RMSE

In [1]:
import os
import gc
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from  datetime import datetime, timedelta
import scipy as sp
%matplotlib inline

import torch
from torch.optim import Adam
from torch.nn import MSELoss

from deepseries.models import BasicSeq2Seq
from deepseries.train import Learner
from deepseries.dataset import TimeSeries, Property, Seq2SeqDataLoader
from deepseries.nn.loss import RMSELoss

In [4]:
DIR = "../data"
N_ROWS = 1000
BATCH_SIZE = 32


LAGS = [365]
MAX_LAGS = max(LAGS)
DROP_BEFORE = 500


ENC_LEN = 365 * 2 + 1
DEC_LEN = 28

VALID_LEN = 28
TEST_LEN = 28

TRAIN_LAST_DAY = 1913
USE_SERIES_LEN = TRAIN_LAST_DAY - DROP_BEFORE + 1 + 28

CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "int16", 'snap_TX': 'int16', 'snap_WI': 'int16' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

# FIRST_DAY = datetime(2011, 1, 29) 
# FORECAST_DAY = datetime(2016,4, 25) 

In [5]:
def load_data():
    label_encoders = {}
    
    prices = pd.read_csv(os.path.join(DIR, "sell_prices.csv"), dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            if col not in label_encoders:
                label_encoders[col] = LabelEncoder().fit(prices[col].astype(str).fillna("None"))
            prices[col] = label_encoders[col].transform(prices[col].astype(str).fillna("None")).astype("int16")
    
    cal = pd.read_csv(os.path.join(DIR, "calendar.csv"), dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            if col not in label_encoders:
                label_encoders[col] = LabelEncoder().fit(cal[col].astype(str).fillna("None"))
            cal[col] = label_encoders[col].transform(cal[col].astype(str).fillna("None")).astype("int16")
    
    start_day = max(1, DROP_BEFORE-MAX_LAGS)
    numcols = [f"d_{day}" for day in range(start_day, TRAIN_LAST_DAY+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(os.path.join(DIR, "sales_train_validation.csv"), 
                     usecols = catcols + numcols, dtype = dtype, nrows=N_ROWS)
    
    for col in catcols:
        if col != "id":
            if col not in label_encoders:
                label_encoders[col] = LabelEncoder().fit(dt[col].astype(str).fillna("None"))
            dt[col] = label_encoders[col].transform(dt[col].astype(str).fillna("None")).astype("int16")

    for day in range(TRAIN_LAST_DAY+1, TRAIN_LAST_DAY+28+1):
        dt[f"d_{day}"] = np.nan
    
    product = dt[catcols].copy()
    print(f"product shape {product.shape}")

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal[['d', 'wm_yr_wk']], on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    dt['d'] = dt['d'].str.replace('d_', '').astype("int32")
    price = dt.pivot(index="id", columns="d", values="sell_price")
    xy = dt.pivot(index="id", columns="d", values="sales")
    del dt; gc.collect()
    print(f"sale_xy shape {xy.shape}")
    print(f"price shape {price.shape}")

    cal_use_col = ['date', 'wday', 'month', 'year', 'event_name_1',
    'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX',
    'snap_WI']
    cal = cal[cal_use_col]
    cal['year'] = cal['year'] - cal['year'].min()
    cal['quarter'] = cal.date.dt.quarter
    cal = cal.drop("date", axis=1).T
    cal = cal[xy.columns]
    print(f"calendar shape {cal.shape}")
    
    submission = pd.read_csv(os.path.join(DIR, "sample_submission.csv"))
    print(f"submisson shape {submission.shape}")
    return xy, price, cal, product, submission

df_series, df_price, df_calendar, df_product, df_sub = load_data()

product shape (1000, 6)
sale_xy shape (1000, 1807)
price shape (1000, 1807)
calendar shape (11, 1807)
submisson shape (60980, 29)


In [6]:
df_series.head()

d,135,136,137,138,139,140,141,142,143,144,...,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_validation,,,,,,,,,,,...,,,,,,,,,,
HOBBIES_1_002_CA_1_validation,,,,,,,0.0,0.0,0.0,1.0,...,,,,,,,,,,
HOBBIES_1_003_CA_1_validation,,,,,,,,,,,...,,,,,,,,,,
HOBBIES_1_004_CA_1_validation,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,...,,,,,,,,,,
HOBBIES_1_005_CA_1_validation,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,7.0,...,,,,,,,,,,


In [7]:
df_price.head()

d,135,136,137,138,139,140,141,142,143,144,...,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_validation,,,,,,,,,,,...,8.38,8.38,8.38,8.38,8.38,8.38,8.38,8.38,8.38,8.38
HOBBIES_1_002_CA_1_validation,,,,,,,3.97,3.97,3.97,3.97,...,3.97,3.97,3.97,3.97,3.97,3.97,3.97,3.97,3.97,3.97
HOBBIES_1_003_CA_1_validation,,,,,,,,,,,...,2.97,2.97,2.97,2.97,2.97,2.97,2.97,2.97,2.97,2.97
HOBBIES_1_004_CA_1_validation,4.34,4.34,4.34,4.34,4.34,4.34,4.34,4.34,4.34,4.34,...,4.64,4.64,4.64,4.64,4.64,4.64,4.64,4.64,4.64,4.64
HOBBIES_1_005_CA_1_validation,2.98,2.98,2.98,2.98,2.98,2.98,2.98,2.98,2.98,2.98,...,2.88,2.88,2.88,2.88,2.88,2.88,2.88,2.88,2.88,2.88


In [10]:
df_calendar.head()

Unnamed: 0,135,136,137,138,139,140,141,142,143,144,...,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941
wday,3,4,5,6,7,1,2,3,4,5,...,1,2,3,4,5,6,7,1,2,3
month,6,6,6,6,6,6,6,6,6,6,...,5,5,5,5,5,5,5,5,5,5
year,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
event_name_1,30,30,30,30,30,30,7,30,30,30,...,30,30,30,30,30,30,30,30,30,30
event_type_1,4,4,4,4,4,4,0,4,4,4,...,4,4,4,4,4,4,4,4,4,4


In [11]:
# series
series = np.log1p(df_series.values)
price = np.log1p(df_price.values)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# series state
series_nan = np.isnan(series).astype("int8")
series_zero = (series == 0).astype("int8")

In [13]:
# series statistics

series_valid_masked = np.ma.masked_array(series, mask=series_nan.astype(bool))

series_mean = series_valid_masked.mean(axis=1).data
series_std = series_valid_masked.std(axis=1).data
series_skew = sp.stats.mstats.skew(series_valid_masked, axis=1).data
series_kurt = np.clip(sp.stats.mstats.kurtosis(series_valid_masked, axis=1).data, None, 10)

In [None]:
# series normalization

series = np.nan_to_num((series - np.expand_dims(series_mean, 1)) / (np.expand_dims(series_std, 1) + 1e-7), 0.).astype("float32")

In [None]:
# series statistic features

series_mean_mean = series_mean.mean()
series_mean_std = series_mean.std()
series_std_mean = series_std.mean()
series_std_std = series_std.std()
series_skew_mean = series_skew.mean()
series_skew_std = series_skew.std()
series_kurt_mean = series_kurt.mean()
series_kurt_std = series_kurt.std()

xy_series_mean = (series_mean - series_mean_mean) / xy_mean_std
xy_series_std = (series_std - series_std_mean) / xy_std_std
xy_series_skew = (series_skew - series_skew_mean) / xy_skew_std
xy_series_kurt = (series_kurt - series_kurt_mean) / xy_kurt_std

f1101_xy_statistic = np.stack([f1101_xy_mean, f1101_xy_std, f1101_xy_skew, f1101_xy_kurt], 
                              axis=1).astype("float32")
f1011_xy_stats = np.stack([xy_nan, xy_zero], axis=1).astype("float32")[:, :, MAX_LAGS:]

del f1101_xy_mean; del f1101_xy_std; del f1101_xy_skew; del f1101_xy_kurt; gc.collect()