In [1]:
%%capture

!pip install darts
!pip install openpyxl

In [2]:
!pip install xlrd

In [3]:
!curl -L https://github.com/unit8co/amld2022-forecasting-and-metalearning/blob/main/data/m3_dataset.xls\?raw\=true -o m3_dataset.xls

In [29]:
%matplotlib inline

import warnings

warnings.filterwarnings("ignore")

import os
import time
import random
import pandas as pd
import pickle
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime
from itertools import product
import torch
from torch import nn
from typing import List, Tuple, Dict
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt

from darts import TimeSeries
from darts.utils.losses import SmapeLoss
from darts.dataprocessing.transformers import Scaler, InvertibleMapper
from darts.metrics import smape
from darts.utils.utils import SeasonalityMode, TrendMode, ModelMode
from darts.models import *

In [30]:
import sys
# Raise the interpreter recursion limit to 3000.
# NOTE(review): the reason is not visible in this notebook — presumably some later
# call recurses deeply; confirm it is still needed.
sys.setrecursionlimit(3000)

In [31]:
# Number of trailing points of every M3 series that load_m3() holds out as the test split.
HORIZON=18

In [32]:
def load_m3() -> Tuple[List[TimeSeries], List[TimeSeries], Scaler]:
    """Load the monthly M3 series and return scaled train/test splits.

    Reads the "M3Month" sheet of ``m3_dataset.xls`` from the working directory,
    builds one ``TimeSeries`` per row, holds out the last ``HORIZON`` points of
    each series as the test part, and rescales everything with a
    ``MaxAbsScaler`` that is fitted on the training parts only.

    Returns:
        A 3-tuple ``(train_scaled, test_scaled, scaler)``: the scaled training
        series, the scaled test series (same order), and the fitted ``Scaler``
        so callers can invert the scaling on forecasts.
    """
    print("building M3 TimeSeries...")

    # Read DataFrame
    df_m3 = pd.read_excel("m3_dataset.xls", "M3Month")

    # Build TimeSeries
    m3_series = []
    for _, row in tqdm(df_m3.iterrows(), total=len(df_m3)):
        start_year = int(row["Starting Year"])
        start_month = int(row["Starting Month"])
        # Rows with a zero starting month cannot be placed on a calendar; skip them.
        if start_month == 0:
            continue

        # Observations start at position 6 (the leading columns are presumably
        # metadata — TODO confirm); trailing NaNs pad shorter series.
        values_series = row.iloc[6:].dropna()

        start_date = datetime(year=start_year, month=start_month, day=1)
        time_axis = pd.date_range(start_date, periods=len(values_series), freq="M")
        series = TimeSeries.from_times_and_values(
            time_axis, values_series.values
        ).astype(np.float32)
        m3_series.append(series)

    print("\nThere are {} monthly series in the M3 dataset".format(len(m3_series)))

    # Split train/test
    print("splitting train/test...")
    m3_train = [s[:-HORIZON] for s in m3_series]
    m3_test = [s[-HORIZON:] for s in m3_series]

    # Scale so that the largest absolute training value of each series is 1.
    # The scaler is fitted on the training part only and then applied to the test part.
    print("scaling...")
    scaler_m3 = Scaler(scaler=MaxAbsScaler())
    m3_train_scaled: List[TimeSeries] = scaler_m3.fit_transform(m3_train)
    m3_test_scaled: List[TimeSeries] = scaler_m3.transform(m3_test)

    print(
        "done. There are {} series, with average training length {}".format(
            len(m3_train_scaled), np.mean([len(s) for s in m3_train_scaled])
        )
    )
    return m3_train_scaled, m3_test_scaled, scaler_m3

In [33]:
# Build the scaled M3 train/test splits and keep the fitted scaler returned with them.
m3_train, m3_test, scaler = load_m3()

In [63]:
# nbeats_model_m4.fit(
#     m3_train,
#     num_loader_workers=4,
#     epochs=NUM_EPOCHS,
#     max_samples_per_ts=MAX_SAMPLES_PER_TS,
#     val_series=m3_test
# )

In [64]:
def eval_forecasts(
    pred_series: List[TimeSeries], test_series: List[TimeSeries]
) -> List[float]:
    """Score forecasts with sMAPE and show a histogram of the scores.

    Args:
        pred_series: forecasts, one per series.
        test_series: the matching actual series, in the same order.

    Returns:
        The values returned by ``smape(test_series, pred_series)``.
    """
    print("computing sMAPEs...")
    scores = smape(test_series, pred_series)

    # Histogram of the per-series scores; the median goes into the title.
    fig, ax = plt.subplots()
    ax.hist(scores, bins=50)
    ax.set_xlabel("sMAPE")
    ax.set_ylabel("Count")
    ax.set_title("Median sMAPE: %.3f" % np.median(scores))
    plt.show()
    plt.close(fig)

    return scores

In [65]:
# Load the first sheet of the training workbook.
df = pd.read_excel("../input/hacksaispb2022/data/Train.xlsx", sheet_name=0)

# Derive a 'date' string column from the first column: 'm' becomes '-' and '-01'
# is appended, so each label ends with a day-of-month component.
label_col = df.columns[0]
df['date'] = df[label_col].map(lambda label: label.replace('m', '-') + '-01')
df = df.drop(columns=[label_col])

# Row 0 is kept aside as `fs` (read later to find columns flagged 'ln') and then
# removed from the data; the index is renumbered from 0.
fs = df.loc[0]
df = df.drop(index=[0]).reset_index(drop=True)

df.head()

In [66]:
# Undo the log transform on every column whose flag in `fs` (row 0 of the original
# sheet) is 'ln'. Iterating label/value pairs avoids integer-position lookups on a
# label-indexed Series, which pandas has deprecated.
# NOTE: this cell is not idempotent — re-running it exponentiates the same columns again.
for col, transform in fs.items():
    if transform == 'ln':
        df[col] = np.e ** df[col]
df.head()

In [67]:
# Fill missing values: interpolate first, then back-fill whatever is still missing
# (presumably leading gaps that interpolation cannot reach — confirm on the data).
df = df.interpolate().bfill()

In [68]:
from darts import TimeSeries
# One univariate TimeSeries per column, indexed by the 'date' column.
# NOTE(review): `[:-2]` skips the last two columns — 'date' was appended last, so one
# data column is excluded as well; confirm that is intended.
series = [TimeSeries.from_dataframe(df, 'date', x) for x in df.columns[:-2]]

In [69]:
# Collect, from every test workbook, the known (non-"Forecast") history of each column
# as a TimeSeries.
test_dfs = []

for file_no in range(1, 14 + 1):
    test_df = pd.read_excel(
        f"../input/hacksaispb2022/data/Test_example{file_no}.xlsx",
        engine="openpyxl",
        index_col="Unnamed: 0",
    )
    try:
        # Each index label is rewritten as "20" + (first digit + 1) + rest and parsed
        # with "%Ym%m"; the resulting index must be month-start ("MS") regular.
        month_starts = [
            datetime.strptime(f"20{int(label[0]) + 1}{label[1:]}", "%Ym%m")
            for label in test_df.index
        ]
        test_df.index = pd.DatetimeIndex(month_starts, freq="MS")
    except ValueError:
        # Workbooks that do not fit the monthly parse are skipped
        # (a quarterly "QS-DEC" variant was tried here and disabled).
        continue

    file_series = []
    for column in test_df.columns:
        # Last timestamp at which this column still holds a value other than "Forecast".
        last_known = max(test_df[test_df[column] != "Forecast"].index)
        history = test_df.loc[:last_known]
        file_series.append(
            TimeSeries.from_times_and_values(history.index, history[column])
        )
    test_dfs.extend(file_series)

In [72]:
# toDailyAverage = InvertibleMapper(
#     fn=lambda timestamp, x: x / timestamp.days_in_month,
#     inverse_fn=lambda timestamp, x: x * timestamp.days_in_month,
# )
# #scaler = Scaler(MinMaxScaler(feature_range=(, 1)))
# scaler = Scaler()
# #pipeline = Pipeline([toDailyAverage, scaler])

# #train_transformed = pipeline.fit_transform(series)
# #val_transformed = pipeline.transform(X_test)

# # model = NBEATSModel(36, 12,
# #                  add_encoders={
# #                          'cyclic': {'future': ['month']},
# #                          'position': {'past': ['absolute'], 'future': ['relative']},
# #                      },
# #                 optimizer_kwargs={'lr': 2e-4},    
# #                 torch_device_str='cuda:0',
# #                 #pl_trainer_kwargs={"callbacks": [my_stopper]},
# #                 generic_architecture=True,
# #                 dropout=0.3, 
# #                 batch_size = 256,
# #                 )

# IN_LEN = 36
# OUT_LEN = 4

# # Architecture hyper-params:
# NUM_STACKS = 20
# NUM_BLOCKS = 2
# NUM_LAYERS = 2
# LAYER_WIDTH = 136
# COEFFS_DIM = 11

# # Training settings:
# LR = 1e-5
# BATCH_SIZE = 4
# MAX_SAMPLES_PER_TS = (
#     35  # <-- new parameter, limiting the number of training samples per series
# )
# NUM_EPOCHS = 10

# nbeats_model_m4 = NBEATSModel(
#     input_chunk_length=IN_LEN,
#     output_chunk_length=OUT_LEN,
#     batch_size=BATCH_SIZE,
#     num_stacks=NUM_STACKS,
#     num_blocks=NUM_BLOCKS,
#     num_layers=NUM_LAYERS,
#     layer_widths=LAYER_WIDTH,
#     expansion_coefficient_dim=COEFFS_DIM,
#     loss_fn=SmapeLoss(),
#     optimizer_kwargs={"lr": LR},
#     pl_trainer_kwargs={
#         "enable_progress_bar": True,
#         "accelerator": "gpu",
#         "gpus": -1,
#         "auto_select_gpus": True,
#     },
# )

#model = AutoARIMA(verbose=True, )

In [95]:
# Window sizes: the model reads IN_LEN past points and emits OUT_LEN points per pass.
IN_LEN = 50
OUT_LEN = 4

# Architecture hyper-params:
NUM_STACKS = 20
NUM_BLOCKS = 15
NUM_LAYERS = 25
LAYER_WIDTH = 136
COEFFS_DIM = 11

# Training settings:
LR = 1e-4
BATCH_SIZE = 16
# Upper bound on the number of training samples drawn from each series.
MAX_SAMPLES_PER_TS = 10
NUM_EPOCHS = 5

nbeats_model_m4 = NBEATSModel(
    input_chunk_length=IN_LEN,
    output_chunk_length=OUT_LEN,
    batch_size=BATCH_SIZE,
    num_stacks=NUM_STACKS,
    num_blocks=NUM_BLOCKS,
    num_layers=NUM_LAYERS,
    layer_widths=LAYER_WIDTH,
    expansion_coefficient_dim=COEFFS_DIM,
    loss_fn=SmapeLoss(),
    optimizer_kwargs={"lr": LR},
    # Train on every available GPU. The Trainer arguments `gpus` and
    # `auto_select_gpus` were removed in PyTorch Lightning 2.0;
    # `accelerator` + `devices` is the supported spelling.
    pl_trainer_kwargs={
        "enable_progress_bar": True,
        "accelerator": "gpu",
        "devices": -1,
    },
)


In [96]:
# Train on the per-column training series, validating against the histories taken
# from the test workbooks.
# NOTE(review): `epochs` is hard-coded to 25 here while the config cell defines
# NUM_EPOCHS = 5, which is not used by this call — confirm which value is intended.
nbeats_model_m4.fit(
    series,
    num_loader_workers=4,
    epochs=25,
    max_samples_per_ts=MAX_SAMPLES_PER_TS,
    val_series=test_dfs
)

In [93]:
import matplotlib.pyplot as plt
from darts.metrics import mape, smape

# Columns evaluated in every test workbook.
col_names = ["Var1", "Var2", "Var3", "Var4"]

# For each test workbook: take the known part of every column, forecast its last
# quarter from the first three quarters, plot the result and print the sMAPE.
for i in range(1, 15):
    try:
        test_df = pd.read_excel(f"../input/hacksaispb2022/data/Test_example{i}.xlsx", engine="openpyxl", index_col="Unnamed: 0")
        test_df.index = pd.DatetimeIndex(map(lambda x: datetime.strptime(f"20{x}", "%Ym%m"), test_df.index), freq="MS")

        for col_name in col_names:
            # Keep only rows that hold real values (not the "Forecast" placeholder).
            X_test = TimeSeries.from_series(test_df[test_df[col_name] != "Forecast"][col_name]).astype(np.float32)
            X_val, y_val = X_test.split_before(.75)
            pred = nbeats_model_m4.predict(n=len(y_val), series=X_val, verbose=10)

            # Shift the whole forecast so its first point coincides with the last
            # observed value of the input window.
            bias = pred[0].values()[0][0] - X_val[-1].values()[0][0]
            pred -= bias

            X_test.plot()
            pred.plot(label='forecast', low_quantile=0.05, high_quantile=0.95)
            plt.legend()
            plt.show()

            # The metric computed is sMAPE, so the message says so.
            print("Symmetric mean absolute percentage error: {:.2f}%.".format(smape(y_val, pred)))
    except Exception as exc:
        # Best-effort: a workbook that cannot be processed is skipped, but the
        # reason is reported instead of being silently discarded.
        print(f"Skipping Test_example{i}.xlsx: {exc!r}")

In [94]:
# Persist the model to the working directory under a fixed file name.
nbeats_model_m4.save('nbeats_goodboy_v2.pth')

Доучим модель на остальных столбцах

In [146]:
# Column names to iterate over: all of df's columns except the last one
# ('date', appended when the workbook was loaded) and the first one.
# NOTE(review): why the first column is excluded is not visible here — confirm.
columns = list(df.columns)
columns.pop()
columns.pop(0)

In [147]:
from darts.metrics import mape
import matplotlib.pyplot as plt

In [None]:
# Per-column loop: fit on the first 90% of each column, forecast the rest, report MAPE
# and plot. Every 5th column's validation slice and forecast are kept for later.
# NOTE(review): `pipeline` and `model` are not defined in any cell visible in this
# notebook (only commented-out code mentions them), so on a fresh kernel this cell
# raises NameError — define them, or confirm they come from elsewhere.
val_history = []
pred_history = []

for i, col in enumerate(columns):
    # Build the series for this column and split it 90/10 into train/validation.
    # This rebinds `series`, which earlier held the list of training series.
    series = TimeSeries.from_dataframe(df, 'date', col)
    train, val = series.split_before(0.9)
    
    # Transform the training part (the pipeline is used here without being fitted in this cell).
    train_transformed = pipeline.transform(train)
    
    # Fit on the transformed training part and forecast as many steps as `val` has.
    model.fit(train_transformed, epochs=75)
    prediction = model.predict(len(val))
    
    # Map the forecast back to the original scale.
    pred = pipeline.inverse_transform(prediction)
    
    # Report MAPE against the validation slice and plot series vs. forecast.
    print(f"Mean absolute percentage error for {col}: {mape(val, pred)}%. ")
    series.plot()
    pred.plot(label='forecast', low_quantile=0.05, high_quantile=0.95)
    plt.legend()
    plt.show()
    
    # Keep every 5th column's validation slice and forecast.
    if i % 5 == 0:
        val_history.append(val)
        pred_history.append(pred)

In [148]:
from darts import TimeSeries
# Single-column example: build the series for the third column of df.
series = TimeSeries.from_dataframe(df, 'date', df.columns[2])

# First 90% for training, last 10% for validation; show the training part.
train, val = series.split_before(0.9)
train.plot()

In [151]:
# Forecast as many steps as the validation slice has and plot forecast vs. actuals.
# NOTE(review): `model` is not defined in any cell visible in this notebook — confirm
# where it comes from.
predict = model.predict(n=len(val))

# Plot the forecast computed in this cell. Without this assignment `pred` would be a
# stale value left over from an earlier cell, belonging to a different series.
# If the model was trained on transformed data, the forecast still has to be mapped
# back with the fitted transformer's inverse_transform (that step was disabled here).
pred = predict

val.plot(label='actual')
pred.plot(label='forecast')

In [None]:
# MAPE between the validation slice and whatever `pred` currently holds.
mape(val, pred)