In [1]:
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import os
from tqdm import *
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA. `device` stays a plain string because later cells
# pass it straight to `.to(device)` and to the dataset constructor.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Last expression of the cell: displays the resolved torch.device.
torch.device(device)

device(type='cuda')

In [2]:
def read_set(file):
    """Load a time-series CSV and return it indexed by its parsed date column.

    Column names are normalised before anything else touches them:
    ``DATE`` becomes ``date`` and ``hits`` becomes ``value``. The rename has
    to happen before the datetime conversion, otherwise a file whose header
    is ``DATE`` fails with ``KeyError: 'date'``.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame indexed by the datetime ``date`` column.

    Raises:
        KeyError: if the file has neither a ``date`` nor a ``DATE`` column.
    """
    data = pd.read_csv(file)
    # Renaming is a no-op for columns that are absent, so files that already
    # use the normalised names pass through unchanged.
    data = data.rename(columns={'DATE': 'date', 'hits': 'value'})
    data['date'] = pd.to_datetime(data['date'])
    return data.set_index('date')

In [3]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    The absolute error is divided element-wise by ``|y_true|``, with the
    denominator floored at 1e-6 so that zero targets do not divide by zero.
    The averaged ratio is returned as a percentage.
    """
    absolute_error = np.abs(y_pred - y_true)
    safe_denominator = np.maximum(np.abs(y_true), 1e-6)
    return np.average(absolute_error / safe_denominator) * 100

In [4]:
def seed_everything(seed: int):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), sets ``PYTHONHASHSEED`` and configures cuDNN
    for deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call when CUDA is unavailable; covers multi-GPU setups too.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels between
    # runs, which defeats determinism, so it must be disabled here.
    torch.backends.cudnn.benchmark = False

seed_everything(101)

In [5]:
from sklearn.preprocessing import StandardScaler

# Load the training series and standardise every column in place. The fitted
# scaler is kept so that forecasts can be mapped back to the original units.
data_train = read_set('train.csv')
scaler = StandardScaler()
data_train[:] = scaler.fit_transform(data_train)

# Derive integer calendar features from the datetime index. A temporary
# `date` column carries the timestamps and is dropped once they are extracted.
data_train = (
    data_train
    .assign(date=pd.to_datetime(data_train.index))
    .assign(
        day=lambda frame: frame['date'].apply(lambda stamp: stamp.day),
        week=lambda frame: frame['date'].apply(lambda stamp: stamp.week),
        month=lambda frame: frame['date'].apply(lambda stamp: stamp.month),
        weekday=lambda frame: frame['date'].apply(lambda stamp: stamp.dayofweek),
    )
    .drop(columns='date')
)

data_train

Unnamed: 0_level_0,value,day,week,month,weekday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01,-1.006549,1,53,1,4
2016-01-02,-0.731399,2,53,1,5
2016-01-03,-0.592023,3,53,1,6
2016-01-04,-0.254630,4,1,1,0
2016-01-05,-0.157837,5,1,1,1
...,...,...,...,...,...
2018-12-27,-0.496334,27,52,12,3
2018-12-28,-0.578033,28,52,12,4
2018-12-29,-1.085962,29,52,12,5
2018-12-30,-2.001954,30,52,12,6


In [6]:
# The numeric values from this file are not used; it only serves as a
# template of dates, which avoids building the calendar features by hand.
data_test = read_set('derived.csv')

# Same calendar features as for the training frame, taken from the index.
data_test = (
    data_test
    .assign(date=pd.to_datetime(data_test.index))
    .assign(
        day=lambda frame: frame['date'].apply(lambda stamp: stamp.day),
        week=lambda frame: frame['date'].apply(lambda stamp: stamp.week),
        month=lambda frame: frame['date'].apply(lambda stamp: stamp.month),
        weekday=lambda frame: frame['date'].apply(lambda stamp: stamp.dayofweek),
    )
    .drop(columns='date')
)

# The `value` column is deliberately left as-is (not zeroed) for convenience:
# data_test["value"] = 0.0
# Train rows followed by test rows; the test part only contributes dates.
df = pd.concat([data_train, data_test])
df

Unnamed: 0_level_0,value,day,week,month,weekday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01,-1.006549e+00,1,53,1,4
2016-01-02,-7.313992e-01,2,53,1,5
2016-01-03,-5.920227e-01,3,53,1,6
2016-01-04,-2.546302e-01,4,1,1,0
2016-01-05,-1.578371e-01,5,1,1,1
...,...,...,...,...,...
2019-12-27,2.486813e+08,27,52,12,4
2019-12-28,1.394860e+08,28,52,12,5
2019-12-29,1.222826e+08,29,52,12,6
2019-12-30,2.256384e+08,30,1,12,0


In [7]:
class SeqToSeqDatasetWithDate(Dataset):
    """Sliding-window dataset over a frame whose first column is the target.

    Item ``i`` consists of:

    * ``seq_x`` - column 0 of rows ``[i, i + sequence_length)`` as float;
    * ``seq_x_dates`` - the remaining columns of the same rows as long
      (integer-coded features, e.g. calendar fields);
    * ``seq_y`` - column 0 of the following ``forecast_length`` rows as float.

    Args:
        dataframe: Frame whose ``.values`` is a 2-D numeric array with the
            target in column 0.
        sequence_length: Number of rows in each input window.
        forecast_length: Number of rows in each target window.
        device: Device every returned tensor is moved to.
    """

    def __init__(
        self,
        dataframe,
        sequence_length=96,
        forecast_length=24,
        device="cpu",
    ):
        self.sequence_length = sequence_length
        self.forecast_length = forecast_length
        self.device = device

        self.data = dataframe.values

    def __len__(self):
        """Return the number of complete windows (0 if the frame is too short)."""
        n_windows = self.data.shape[0] - self.sequence_length - self.forecast_length + 1
        # A negative length would make `len()` raise; report an empty dataset.
        return max(n_windows, 0)

    def __getitem__(self, i):
        """Return ``(seq_x, seq_x_dates, seq_y)`` for window ``i``.

        Negative indices count from the end, like a sequence.

        Raises:
            IndexError: if ``i`` does not address a complete window. Without
                this check slicing would silently yield truncated tensors and
                plain iteration over the dataset would never terminate.
        """
        n_windows = len(self)
        if i < 0:
            i += n_windows
        if not 0 <= i < n_windows:
            raise IndexError(f"window index {i} out of range for {n_windows} windows")

        src_start = i
        src_end = src_start + self.sequence_length

        # The target window starts right after the input window.
        y_start = src_end
        y_end = y_start + self.forecast_length

        seq_x = torch.tensor(self.data[src_start:src_end, 0]).float()
        seq_y = torch.tensor(self.data[y_start:y_end, 0]).float()

        seq_x_dates = torch.tensor(self.data[src_start:src_end, 1:]).long()

        return seq_x.to(self.device), seq_x_dates.to(self.device), seq_y.to(self.device)


In [8]:
# Window configuration: each sample sees `sequence_length` past steps and
# predicts the next `forecast_length` step(s).
batch_size = 64
sequence_length = 720
forecast_length = 1

# Training windows come from the scaled training frame only.
train_dataset = SeqToSeqDatasetWithDate(
    dataframe=data_train,
    sequence_length=sequence_length,
    forecast_length=forecast_length,
    device=device,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Pull one batch to sanity-check the tensor shapes.
X, dates, y = next(iter(train_loader))
print(f"Features:\t{X.shape}")
print(f"dates:\t\t{dates.shape}")
print(f"Target:\t\t{y.shape}")

# Same windowing over the concatenated train + test frame; used at forecast time.
df_dataset = SeqToSeqDatasetWithDate(
    dataframe=df,
    sequence_length=sequence_length,
    forecast_length=forecast_length,
    device=device,
)


Features:	torch.Size([64, 720])
dates:		torch.Size([64, 720, 4])
Target:		torch.Size([64, 1])


In [9]:
class TimeSeriesModel(nn.Module):
    """LSTM regressor fed with a numeric series plus embedded integer features.

    Args:
        hidden_size: Hidden size of the LSTM.
        discrete_features: Embedding widths, indexed 0..3, for the day, week,
            month and weekday feature columns respectively.
        input_size: Width of the numeric part of the input (last dim of ``src``).
    """

    def __init__(self, hidden_size: int, discrete_features, input_size):
        super().__init__()
        # Table sizes are N + 1 so that index N itself is a valid lookup:
        # 32 rows for day, 55 for week, 13 for month, 8 for weekday.
        self.day_emb = nn.Embedding(num_embeddings=32, embedding_dim=discrete_features[0])
        self.week_emb = nn.Embedding(num_embeddings=55, embedding_dim=discrete_features[1])
        self.mon_emb = nn.Embedding(num_embeddings=13, embedding_dim=discrete_features[2])
        self.weekday_emb = nn.Embedding(num_embeddings=8, embedding_dim=discrete_features[3])

        # The LSTM consumes the numeric input concatenated with all embeddings.
        rnn_input_width = sum(discrete_features) + input_size
        self._rnn = nn.LSTM(rnn_input_width, hidden_size, batch_first=True)
        self._output = nn.Linear(hidden_size, 1)

    def forward(self, src, dates):
        """Return one prediction per batch element.

        ``dates[:, :, k]`` is looked up in the k-th embedding table (day,
        week, month, weekday), the results are concatenated after ``src``
        along the last dimension, and the LSTM output at the final time step
        is projected to a single value.
        """
        tables = (self.day_emb, self.week_emb, self.mon_emb, self.weekday_emb)
        embedded = [table(dates[:, :, column]) for column, table in enumerate(tables)]

        rnn_input = torch.cat([src, *embedded], dim=-1)
        hidden_states, _ = self._rnn(rnn_input)

        # Only the last time step feeds the regression head.
        return self._output(hidden_states[:, -1, :])


In [10]:
def get_forecast(model):
    """Roll the model forward autoregressively over the test period.

    Relies on the notebook globals ``df_dataset``, ``train_dataset`` and
    ``data_test``. Starting from the window at index ``len(train_dataset)``
    of ``df_dataset``, it predicts one step, appends the prediction to the
    window, drops the oldest value and repeats ``len(data_test)`` times.
    Only the date features are read from later windows, never their values.

    Args:
        model: Module called as ``model(src, dates)`` with ``src`` of shape
            ``(1, window, 1)``, returning a ``(1, 1)`` tensor.

    Returns:
        list[float]: One prediction per row of ``data_test``, in the same
        (scaled) units the model was trained on.
    """
    first_window = len(train_dataset)

    # Last fully known sequence; everything after it would be test values.
    src, _, _ = df_dataset[first_window]
    # Add a fake batch dimension.
    src = src.unsqueeze(0)

    # Run inference in eval mode and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()

    outs = []
    try:
        with torch.no_grad():
            for t in range(len(data_test)):
                # From the extended dataset only the dates are taken.
                _, dates, _ = df_dataset[first_window + t]
                dates = dates.unsqueeze(0)

                # Call the module itself (not `.forward`) so hooks still run.
                out = model(src.unsqueeze(-1), dates)[0, 0]

                # Shift the window left by one step and append the prediction.
                src = src.roll(-1, dims=-1)
                src[0, -1] = out

                outs.append(out.item())
    finally:
        model.train(was_training)
    return outs

In [16]:
# LSTM with 64 hidden units, a 4-wide embedding per calendar feature and a
# single numeric input channel, trained with Adam on mean squared error.
model = TimeSeriesModel(
    hidden_size=64,
    discrete_features=(4, 4, 4, 4),
    input_size=1,
).to(device)
optimizer = Adam(params=model.parameters(), lr=1e-3)
loss_function = nn.MSELoss()


In [17]:
# Train for 40 epochs and log the mean batch loss of each epoch to TensorBoard.
# The writer is used as a context manager so it is flushed and closed even if
# training is interrupted by an exception.
with SummaryWriter() as writer:
    model.train()

    for epoch in range(1, 40 + 1):
        total_loss = 0
        num_batches = len(train_loader)

        for X, dates, y in train_loader:
            # Call the module itself (not `.forward`) so registered hooks run.
            out = model(X.unsqueeze(-1), dates)

            loss = loss_function(out, y)

            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm to keep the LSTM updates stable.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()

        # Guard against an empty loader (series shorter than one window).
        avg_loss = total_loss / max(num_batches, 1)
        writer.add_scalar("Loss/train", avg_loss, epoch)


In [18]:
# The scaler expects a 2-D (n_samples, n_features) array, so the forecast is
# reshaped into a single column before inverting the standardisation and
# flattened again to give the DataFrame a 1-D column.
forecast_scaled = np.asarray(get_forecast(model), dtype=float).reshape(-1, 1)
forecast_hits = scaler.inverse_transform(forecast_scaled).ravel()

pd.DataFrame(
    {
        "date": data_test.index,
        "hits": np.round(forecast_hits).astype("int"),
    }
).to_csv("submission.csv", index=False)
