In [None]:
!pip3 install pmdarima
!pip3 install pyplot

In [18]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pmdarima as pm
import numpy as np

RANDOM_SEED = 90

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"{device}" " is available.")

%matplotlib inline

# Ноутбук для ДЗ №8: обработка временных последовательностей с помощью RNN
В данном ДЗ вам будет дана временная последовательность, которая описывает распределение хитов по времени за несколько лет. Вам нужно будет обучить модель RNN на исторических данных и затем сделать предсказание для «будущего» года — в текущей задаче это 2019 год.

## Готовим данные
Три файла с данными:
1. Исторические данные — train
2. Тестовые данные текущего момента — derived
3. Пример сабмита результатов конкурса на Kaggle

In [2]:
def read_set(file):
    """Load a time-series CSV into a DataFrame indexed by date.

    The date column may be named either ``date`` or ``DATE`` and the measurement
    column ``hits``; they are normalised to a ``date`` index and a ``value`` column.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame indexed by the parsed ``date`` column.

    Raises:
        KeyError: If the file has neither a ``date`` nor a ``DATE`` column.
    """
    data = pd.read_csv(file)
    # Rename BEFORE parsing: the original parsed data['date'] first, which raised
    # KeyError for files whose header is 'DATE', making the rename unreachable.
    data = data.rename(columns={'DATE': 'date', 'hits': 'value'})
    data['date'] = pd.to_datetime(data['date'])
    return data.set_index('date')

In [3]:
# get data
!wget https://github.com/Totenkaf/DL_Homeworks/raw/main/HW_8/data/derived.csv -P data/
!wget https://github.com/Totenkaf/DL_Homeworks/raw/main/HW_8/data/sample_submission.csv -P data/
!wget https://github.com/Totenkaf/DL_Homeworks/raw/main/HW_8/data/train.csv -P data/

In [19]:
# make subsidiary directories
!mkdir -p logs
!mkdir -p models
!mkdir -p submissions

In [20]:
data_train = read_set('data/train.csv')
data_test =  read_set('data/derived.csv')
data_sample = read_set('data/sample_submission.csv')

### Проверим данные, которые мы загрузили

In [21]:
print(data_train.info())
print(data_test.info())
print(data_sample.info())

## Графики наших временных последовательностей

In [22]:
import plotly.graph_objs as go
from plotly.offline import iplot

def plot_datasets(df_s, names, title=None):
    """Render several frames as line traces on one interactive Plotly figure.

    Args:
        df_s: Sequence of DataFrames; each one's index is used for x and its
            ``value`` attribute for y.
        names: Legend labels, looked up by position for every frame in ``df_s``.
        title: Optional figure title.
    """
    traces = [
        go.Scatter(
            x=frame.index,
            y=frame.value,
            mode="lines",
            name=names[pos],
            marker=dict(),
            text=frame.index,
        )
        for pos, frame in enumerate(df_s)
    ]

    figure = {
        "data": traces,
        "layout": {
            "title": title,
            "xaxis": {"title": "Date", "ticklen": 5, "zeroline": False},
            "yaxis": {"title": "Value", "ticklen": 5, "zeroline": False},
        },
    }
    iplot(figure)

In [23]:
plot_datasets([data_train, data_test, data_sample], names=['Train', 'Test', 'Sample'], title='Data')

## Статистическая модель [ARIMA](https://ru.wikipedia.org/wiki/ARIMA)


In [10]:
# обучаем модель
arima_model=model = pm.auto_arima(data_train, 
                                   seasonal=True, m=4, test='adf', error_action='ignore',  
                                   suppress_warnings=True,
                                   stepwise=True, trace=True
                              )

In [11]:
# Forecast one step per test row and label the forecasts with the test dates.
# - `arima_model` is used instead of its alias `model`, because `model` is
#   re-bound to the LSTM further down and would break this cell on a re-run.
# - len(data_test) is the row count; `.size` only equals it for one column.
# - np.asarray drops any index the forecast may carry, so the values are placed
#   positionally (NOTE(review): some pmdarima versions appear to return a
#   pandas Series here, which DataFrame(...) would re-index by label - confirm).
prediction = pd.DataFrame(
    np.asarray(arima_model.predict(n_periods=len(data_test))),
    index=data_test.index,
)

In [12]:
prediction = prediction.rename(columns = {0:'value'})

Смотрим, что она нам предсказала

In [24]:
plot_datasets([data_train, data_test, prediction], names=['Train', 'Test', 'Arima'], title='Data')

In [25]:
# NOTE(review): calculate_metrics is defined in a later cell of this notebook,
# so on a fresh kernel (Restart & Run All) this call raises NameError unless
# that cell has been executed first.
res = calculate_metrics(data_test, data_sample)

### Функция подсчета метрик для конкурса

In [16]:
def MAPE(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Inputs are flattened to 1-D float arrays and compared element by element
    in positional order. This means pandas objects are NOT aligned on their index
    labels, and a column vector compared with a flat vector is not broadcast
    into a matrix. Plain Python sequences are accepted as well.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like) with the same number of elements.

    Returns:
        The average of ``|y_pred - y_true| / max(|y_true|, 1e-6)`` times 100.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # The 1e-6 floor keeps the ratio finite when a true value is exactly zero.
    ratio = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), 1e-6)
    return np.average(ratio) * 100

def calculate_metrics(value, prediction):
    """Compute and print the regression metrics used for the competition.

    Args:
        value: Ground-truth values.
        prediction: Predicted values, same length as ``value``.

    Returns:
        Dict with the keys ``'MAE'``, ``'RMSE'``, ``'R2'`` and ``'MAPE'``.
    """
    result_metrics = {
        'MAE' : mean_absolute_error(value, prediction),
        # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
        # keyword was deprecated in scikit-learn 1.4 and removed in later
        # releases, while the square root works on every version.
        'RMSE' : float(np.sqrt(mean_squared_error(value, prediction))),
        'R2' : r2_score(value, prediction),
        'MAPE': MAPE(value, prediction)
    }

    print(f"Mean Absolute Error:       {result_metrics['MAE']}")
    print(f"Root Mean Squared Error:   {result_metrics['RMSE']}")
    print(f"R2 Score:                  {result_metrics['R2']}")
    print(f"MAPE Score:                {result_metrics['MAPE']}")

    return result_metrics

### MAPE для ARIMA и тестового сабмишена

## Из пандас строим датасет

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler

def get_scaler(scaler):
    """Create a fresh scikit-learn scaler from its short name.

    Args:
        scaler: One of ``'minmax'``, ``'standard'``, ``'maxabs'`` or ``'robust'``
            (case-insensitive).

    Returns:
        A new, unfitted instance of the matching scaler class.

    Raises:
        ValueError: If ``scaler`` is not one of the known names.
    """
    scalers = {
        "minmax": MinMaxScaler,
        "standard": StandardScaler,
        "maxabs": MaxAbsScaler,
        "robust": RobustScaler,
    }
    key = scaler.lower()
    if key not in scalers:
        # Previously an unknown name surfaced as "'NoneType' object is not callable".
        raise ValueError(f"Unknown scaler {scaler!r}; expected one of {sorted(scalers)}")
    return scalers[key]()

In [None]:
class TSDataset(Dataset):
    """Sliding-window dataset over a date-indexed frame with a ``value`` column.

    Sample ``idx`` takes rows ``idx .. idx + seq_len - 1`` as the input window
    and row ``idx + seq_len`` as the target, so the dataset holds
    ``len(data) - seq_len`` samples.

    The previous implementation sliced only ``seq_len`` rows (``seq_len - 1``
    inputs plus the target) while keeping the same length formula. As a result
    the last row of ``data`` was never used as a target and every target sat one
    row earlier than the length formula implies.

    Args:
        data: DataFrame with a ``value`` column and an index whose elements
            expose ``day`` / ``month`` / ``year`` (e.g. a ``DatetimeIndex``).
        seq_len: Number of past rows fed to the model for each prediction.
    """

    def __init__(self, data, seq_len):
        super().__init__()
        # Each sample needs seq_len input rows plus one target row; clamp at
        # zero so len() stays valid when the frame is too short for any sample.
        self._len = max(len(data) - seq_len, 0)
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        return self._len

    def __getitem__(self, idx):
        """Return ``(days, months, years, inputs, target)`` tensors for sample ``idx``.

        The first four tensors have length ``seq_len``; ``target`` has length 1.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self._len
        if not 0 <= idx < self._len:
            raise IndexError(f"index {idx} is out of range for a dataset of length {self._len}")

        # seq_len input rows followed by the single target row.
        window = self.data.iloc[idx:idx + self.seq_len + 1]
        inputs = window.iloc[:-1]
        target = window.iloc[-1:]
        dates = inputs.index

        return (
            torch.tensor(dates.day.to_numpy(), dtype=torch.long),
            torch.tensor(dates.month.to_numpy(), dtype=torch.long),
            torch.tensor(dates.year.to_numpy(), dtype=torch.long),
            torch.tensor(inputs['value'].to_numpy(), dtype=torch.float32),
            torch.tensor(target['value'].to_numpy(), dtype=torch.float32),
        )

In [None]:
data_train

In [None]:
data_test

In [None]:
# Work on copies so the frames loaded from disk keep their raw values.
data_train_scaled = data_train.copy()
data_test_scaled = data_test.copy()

shift_size = 20

# The last `shift_size` training rows are moved in front of the test rows
# (and removed from the training frame) so that windows built over the test
# frame have preceding history to draw from.
data_test_shifted = pd.concat([ data_train_scaled.tail(shift_size), data_test_scaled])
# Explicit copy: assigning into a bare .iloc slice of another frame can raise
# SettingWithCopyWarning and may not write to the intended object.
data_shifted = data_train_scaled.iloc[:-shift_size].copy()

# Normalise the targets; the scaler is fitted on the training part only and
# then re-used for the test part.
scaler = get_scaler('minmax')

data_shifted[['value']] = scaler.fit_transform(data_shifted[['value']])
data_test_shifted[['value']] = scaler.transform(data_test_shifted[['value']])

In [None]:
ds_train = TSDataset(data_shifted, shift_size) 
ds_test  = TSDataset(data_test_shifted, shift_size)

In [None]:
print(ds_train[1057])
ds_test[0]

## Теперь нужно определить нашу модель 

In [None]:
class LSTMModel(nn.Module):
    """LSTM regressor over day-embedding, month-embedding and value features.

    Args:
        input_dim: Pair ``(day_embedding_dim, month_embedding_dim)``.
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Size of the prediction produced for each sequence.
        dropout_prob: Dropout passed to ``nn.LSTM``.
        use_ctx: When True, a ``ctx`` passed to ``forward`` is used as the
            initial ``(h, c)`` state; otherwise every call starts from zeros.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob, use_ctx=False):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.use_ctx = use_ctx

        # Tables of 32 / 13 rows so that day-of-month (1..31) and month (1..12)
        # values can be used as indices directly.
        self.day_emb = nn.Embedding(32, input_dim[0])
        self.month_emb = nn.Embedding(13, input_dim[1])

        # +1 input feature for the scalar series value itself.
        self.lstm = nn.LSTM(
            sum(input_dim)+1, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob, bias=True
        )

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, batch, ctx=None):
        """Predict from one batch.

        Args:
            batch: 5-element sequence ``(days, months, years, x, y)``; only
                ``days``, ``months`` and ``x`` are used here.
            ctx: Optional ``(h, c)`` state from a previous call.

        Returns:
            ``(out, ctx)`` where ``out`` is the linear projection of the LSTM
            output at the last time step and ``ctx`` is the new ``(h, c)``.
        """
        days, months, _, x, _ = batch

        days_tensor = self.day_emb(days)
        # Fixed: months used to be looked up in day_emb, which left month_emb
        # untrained and failed whenever the two embedding sizes differed.
        months_tensor = self.month_emb(months)

        if ctx is None or not self.use_ctx:
            # Zero initial state, created with the embeddings' dtype and device
            # so the model also works after being moved off the CPU.
            state_shape = (self.layer_dim, x.size(0), self.hidden_dim)
            ctx = (days_tensor.new_zeros(state_shape), days_tensor.new_zeros(state_shape))

        features = torch.cat([days_tensor, months_tensor, x.unsqueeze(-1)], dim=-1)
        # Detach the incoming state so gradients do not flow back into the
        # batches that produced it (truncated backpropagation through time).
        out, ctx = self.lstm(features, (ctx[0].detach(), ctx[1].detach()))

        # Keep only the last time step: (batch, seq, hidden) -> (batch, hidden).
        out = out[:, -1, :]

        # Project to (batch, output_dim).
        out = self.fc(out)
        return out, ctx

### Определяем класс, в котором будем обучать

In [None]:
class Optimization:
    """Training / evaluation harness around a recurrent model.

    The wrapped model is called as ``model(batch, ctx)`` and must return
    ``(yhat, ctx)``; the target is always read from the last element of a batch.

    Args:
        model: The module to train.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Scheduler stepped once per epoch with the validation loss.
    """

    def __init__(self, model, loss_fn, optimizer, scheduler):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_losses = []
        self.val_losses = []

        # Recurrent state returned by the most recent training step;
        # make_prediction() starts its roll-out from it.
        self.ctx_last = None
        # Path of the checkpoint written by the last train() call.
        self.model_path = None

    def train_step(self, batch, ctx=None):
        """Run one optimisation step and return ``(loss_value, new_ctx)``."""
        self.model.train()

        yhat, ctx = self.model(batch, ctx)

        y = batch[-1]
        # torch losses take (input, target) in that order.
        loss = self.loss_fn(yhat, y)

        loss.backward()

        # Update parameters, then clear gradients for the next step.
        self.optimizer.step()
        self.optimizer.zero_grad()

        return loss.item(), ctx

    def train(self, train_loader, val_loader, batch_size=64, n_epochs=50, model_dir="models"):
        """Train for ``n_epochs`` epochs, validating after each, then save the weights.

        Args:
            train_loader: Iterable of training batches.
            val_loader: Iterable of validation batches.
            batch_size: Unused; kept for backward compatibility.
            n_epochs: Number of passes over ``train_loader``.
            model_dir: Directory that receives the ``state_dict`` checkpoint
                (created if missing). The resulting path is stored in
                ``self.model_path``.
        """
        # The file name is built from the class name and a timestamp without
        # colons. The previous name interpolated the whole module repr
        # (multi-line) and ignored the models/ directory.
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        model_path = os.path.join(model_dir, f"{type(self.model).__name__}_{timestamp}.pt")

        # The recurrent state is carried across batches and epochs.
        ctx = None
        for epoch in tqdm(range(1, n_epochs + 1)):
            batch_losses = []
            for batch in train_loader:
                loss, ctx = self.train_step(batch, ctx)
                self.ctx_last = ctx
                batch_losses.append(loss)

            training_loss = np.mean(batch_losses)
            self.train_losses.append(training_loss)

            self.model.eval()
            with torch.no_grad():
                batch_val_losses = []
                ctx_val = None
                for batch_eval in val_loader:
                    yhat, ctx_val = self.model(batch_eval, ctx_val)
                    val_loss = self.loss_fn(yhat, batch_eval[-1]).item()
                    batch_val_losses.append(val_loss)

                validation_loss = np.mean(batch_val_losses)
                self.scheduler.step(validation_loss)
                self.val_losses.append(validation_loss)

            if epoch % 10 == 0:
                print(
                    f"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}"
                )

        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        torch.save(self.model.state_dict(), model_path)
        self.model_path = model_path

    def evaluate(self, test_loader, batch_size=1):
        """Predict every batch of ``test_loader`` from its true inputs.

        Args:
            test_loader: Iterable of batches.
            batch_size: Unused; kept for backward compatibility.

        Returns:
            ``(predictions, values)``: two lists of NumPy arrays, one per batch.
        """
        self.model.eval()
        predictions = []
        values = []
        ctx = None
        with torch.no_grad():
            for batch in tqdm(test_loader):
                yhat, ctx = self.model(batch, ctx)
                # .cpu() before .numpy(): the previous `.to(device)` moved the
                # tensors to CUDA whenever a GPU was present, where .numpy() fails.
                predictions.append(yhat.detach().cpu().numpy())
                values.append(batch[-1].detach().cpu().numpy())

        return predictions, values

    def make_prediction(self, test_loader, batch_size=1):
        """Autoregressive roll-out: each prediction is appended to the input window.

        Only the first batch's input window comes from real data; afterwards
        the oldest value is dropped and the latest prediction appended. The
        window is rebuilt with a leading dimension of 1, so ``test_loader``
        is expected to yield batches of size 1.

        Args:
            test_loader: Iterable of single-sample batches.
            batch_size: Unused; kept for backward compatibility.

        Returns:
            ``(predictions, values)``: two lists of NumPy arrays, one per batch.

        Raises:
            RuntimeError: If no training step has run yet (``ctx_last`` unset).
        """
        if self.ctx_last is None:
            raise RuntimeError("make_prediction() needs the state of a trained model: call train() first.")

        self.model.eval()
        predictions = []
        values = []
        with torch.no_grad():
            _, _, _, x_test, _ = next(iter(test_loader))
            # Start from the state of the last sequence in the final training
            # batch, reshaped to a batch of one.
            ctx = tuple(state[:, -1, :].unsqueeze(1).contiguous() for state in self.ctx_last)
            for batch in tqdm(test_loader):
                made_batch = [batch[0], batch[1], batch[2], x_test, batch[4]]

                y_test = batch[-1]
                yhat, ctx = self.model(made_batch, ctx)

                predictions.append(yhat.detach().cpu().numpy())
                values.append(y_test.detach().cpu().numpy())

                # Slide the window: drop the oldest value, append the prediction.
                x_test = torch.cat((torch.reshape(x_test, (-1,)), torch.reshape(yhat, (-1,))))[None, 1:]

        return predictions, values

    def plot_losses(self):
        """Plot the per-epoch training and validation losses collected by train()."""
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Test loss")
        plt.legend()
        plt.title("Losses")
        plt.show()
        plt.close()

### Определяем даталоадеры для теста и трейна

In [None]:
def get_loss_func(loss_name):
    """Return the loss module registered under ``loss_name`` (case-insensitive).

    Supported names are ``'mse'`` and ``'l1'``; any other name raises KeyError.
    """
    registry = dict(
        mse=nn.MSELoss(reduction="mean"),
        l1=nn.L1Loss(),
    )
    key = loss_name.lower()
    return registry[key]

In [None]:
batch_size = 16
dl_train = DataLoader(ds_train, batch_size, shuffle=False, drop_last=True)
dl_test = DataLoader(ds_test, batch_size, shuffle=False, drop_last=True)
dl_test_one = DataLoader(ds_test, 1, shuffle=False)

In [None]:
# input_dim = len(X_train.columns)
input_dim = (4, 4)
output_dim = 1
hidden_dim = 64
layer_dim = 2
dropout = 0.2
n_epochs = 500
learning_rate = 1e-3
weight_decay = 1e-5

model_params = {'input_dim': input_dim,
                'hidden_dim': hidden_dim,
                'layer_dim': layer_dim,
                'output_dim': output_dim,
                'dropout_prob': dropout,
                'use_ctx': True
               }

model = LSTMModel(**model_params)

loss_fn = get_loss_func('l1')
optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=50, factor=0.1, min_lr=1e-8, verbose=True)


opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer, scheduler=scheduler)

In [None]:
opt.train(dl_train, dl_test, batch_size=batch_size, n_epochs=n_epochs)

In [None]:
opt.plot_losses()

In [None]:
# Predictions for evaluating model
predictions, values = opt.evaluate(
    dl_test_one,
    batch_size=1
)

In [None]:
def inverse_transform(scaler, df, columns):
    """Undo the scaling of the given columns of ``df`` in place.

    Every column is passed to ``scaler.inverse_transform`` on its own, as an
    ``(n, 1)`` array. The previous version handed over ``df[col]`` as is, which
    only worked when ``col`` was itself a list of names and the single-feature
    scaler happened to broadcast over several columns.

    Args:
        scaler: Fitted object exposing ``inverse_transform`` for one feature.
        df: DataFrame to modify.
        columns: Iterable whose entries are column names or lists/tuples of
            column names (both ``["a", "b"]`` and ``[["a", "b"]]`` work).

    Returns:
        The same ``df`` object, with the listed columns replaced.
    """
    for entry in columns:
        names = list(entry) if isinstance(entry, (list, tuple)) else [entry]
        for name in names:
            # A plain (n, 1) array avoids feature-name checks against the
            # column the scaler was fitted on.
            restored = scaler.inverse_transform(df[[name]].to_numpy())
            df[name] = np.asarray(restored).ravel()
    return df


def format_predictions(predictions, values, df_test, scaler):
    """Assemble per-batch outputs into one date-indexed, un-scaled frame.

    Args:
        predictions: List of per-batch prediction arrays.
        values: List of per-batch ground-truth arrays.
        df_test: Frame whose trailing index entries label the rows.
        scaler: Scaler handed to ``inverse_transform``.

    Returns:
        DataFrame with ``value`` and ``prediction`` columns, sorted by index.
    """
    flat_true = np.concatenate(values, axis=0).ravel()
    flat_pred = np.concatenate(predictions, axis=0).ravel()

    # Label the rows with the last len(flat_true) index entries of df_test.
    tail_index = df_test.tail(len(flat_true)).index
    frame = pd.DataFrame(
        data={"value": flat_true, "prediction": flat_pred},
        index=tail_index,
    ).sort_index()

    return inverse_transform(scaler, frame, [["value", "prediction"]])


df_result = format_predictions(predictions, values, data_test, scaler)
df_result

In [None]:
result_metrics = calculate_metrics(df_result.value, df_result.prediction)

In [None]:
eval_answer = pd.DataFrame(df_result['prediction']).rename(columns={'prediction': 'value'})
plot_datasets([data_train, data_test, eval_answer], names=['Train', 'Test', 'Eval'], title='Data')

In [None]:
# Predictions for real testing model
predictions, values = opt.make_prediction(
    dl_test_one,
    batch_size=1
)

df_prediction = format_predictions(predictions, values, data_test, scaler)
df_prediction

In [None]:
result_metrics = calculate_metrics(df_prediction.value, df_prediction.prediction)

In [None]:
answer = pd.DataFrame(df_prediction['prediction']).rename(columns={'prediction': 'value'})
plot_datasets([data_train, data_test, answer], names=['Train', 'Test', 'Preds'], title='Data')

In [None]:
answer.rename(columns={'value': 'hits'}).to_csv('answer.csv')