In [1]:
import yfinance as yf

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(0)

<torch._C.Generator at 0x122a54910>

# Data

In [2]:
def open_df(name, date_from, date_to):
    """Download hourly quotes for one ticker via ``yf.download``.

    Args:
        name: ticker symbol handed to ``yf.download`` (e.g. ``'BZ=F'``).
        date_from: start of the requested period.
        date_to: end of the requested period.

    Returns:
        The downloaded frame after ``reset_index()``, i.e. with its former
        index turned into an ordinary column.
    """
    quotes = yf.download(name, start=date_from, end=date_to, interval='1h')
    return quotes.reset_index()

# Both series are requested for the same period.
DATE_FROM, DATE_TO = "2021-01-01", "2022-12-12"

print('Downloading Brent data')
df_brent = open_df('BZ=F', DATE_FROM, DATE_TO)
print('Downloading USD/RUB data')
df_usd = open_df('RUB=X', DATE_FROM, DATE_TO)

# pd.merge defaults to an inner join, so only rows whose 'index' value occurs
# in both frames are kept. Columns present in both frames get the default
# suffixes: '_x' for USD/RUB (left) and '_y' for Brent (right).
df = (
    pd.merge(df_usd, df_brent, on='index')
    .drop(columns=['Volume_x'])  # the USD/RUB volume column is not used
    .set_index('index')
)
df.shape

Downloading Brent data
[*********************100%***********************]  1 of 1 completed
Downloading USD/RUB data
[*********************100%***********************]  1 of 1 completed


(8867, 11)

In [3]:
# Number of leading rows of the feature matrix / target used for training;
# all remaining rows form the test split (see the `[:train_size]` and
# `[train_size:]` slices in the cells below).
train_size = 7000

In [4]:
def sliding_window_features(df, target_column, size):
    """Build shifted-copy features and a next-step-difference target.

    The frame is first sorted by index (ascending). For every column ``c`` and
    every ``k`` in ``1..size`` a feature named ``c_k`` is produced whose value
    in row ``i`` is the value of ``c`` in row ``i + k``. The original columns
    are not part of the result. The target of row ``i`` is
    ``target[i + 1] - target[i]``.

    The last ``size`` rows are dropped: their shifted values (and the last
    target) would otherwise wrap around to the start of the frame.

    NOTE(review): the features of row ``i`` are taken from rows *after* ``i``
    while the target is the change between rows ``i`` and ``i + 1``. Confirm
    that every consumer of this function expects that alignment.

    Args:
        df: input frame; all of its columns are used to build features.
        target_column: name of the column the target is derived from.
        size: number of shifted copies created per column.

    Returns:
        ``(res_df, res_y)``: the feature frame and the target series, sharing
        the same index.
    """
    df_sorted = df.sort_index(ascending=True)

    # Build every shifted column first and assemble the frame in a single
    # step. Inserting the columns one at a time fragments the frame and makes
    # pandas emit a PerformanceWarning once there are many of them.
    shifted = {
        column + '_' + str(shift): np.roll(df_sorted[column].values, -shift)
        for column in df.columns
        for shift in range(1, size + 1)
    }
    features = pd.DataFrame(shifted, index=df_sorted.index)

    # diff()[j] is y[j] - y[j - 1]; rolling by -1 moves y[i + 1] - y[i] to row i.
    next_step_diff = np.roll(df_sorted[target_column].diff().values, -1)

    res_df = features.iloc[:-size]
    res_y = pd.Series(next_step_diff[:-size], index=res_df.index)
    return res_df, res_y

# Baseline

In [5]:
from catboost import CatBoostRegressor

# Baseline: CatBoost regressor on the shifted-copy features, one shift per column.
widow_size = 1
X, y = sliding_window_features(df, 'Close_x', widow_size)

cb = CatBoostRegressor()
cb.fit(X[:train_size], y[:train_size], verbose=False)

# RMSE on the rows that follow the training prefix.
test_residuals = y[train_size:] - cb.predict(X[train_size:])
print('RMSE:', np.sqrt(np.mean(test_residuals**2)))

RMSE: 0.588099889393787


In [6]:
from catboost import CatBoostRegressor

# Baseline: CatBoost regressor on the shifted-copy features, five shifts per column.
widow_size = 5
X, y = sliding_window_features(df, 'Close_x', widow_size)

cb = CatBoostRegressor()
cb.fit(X[:train_size], y[:train_size], verbose=False)

# RMSE on the rows that follow the training prefix.
test_residuals = y[train_size:] - cb.predict(X[train_size:])
print('RMSE:', np.sqrt(np.mean(test_residuals**2)))

RMSE: 0.5912915270204362


In [7]:
from catboost import CatBoostRegressor

# Baseline: CatBoost regressor on the shifted-copy features, ten shifts per column.
widow_size = 10
X, y = sliding_window_features(df, 'Close_x', widow_size)

cb = CatBoostRegressor()
cb.fit(X[:train_size], y[:train_size], verbose=False)

# RMSE on the rows that follow the training prefix.
test_residuals = y[train_size:] - cb.predict(X[train_size:])
print('RMSE:', np.sqrt(np.mean(test_residuals**2)))

  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted
  df_new[column + '_' + str(shift)] = feature_values_shifted


RMSE: 0.5742990231469814


# RMDN (Recurrent Mixture Density Network)

In [8]:
class FinancialDataset(Dataset):
    """Sliding-window dataset over a feature table and a target series.

    Item ``idx`` is the pair
    ``(X[idx : idx + sequence_length], y[idx + sequence_length])``:
    ``sequence_length`` consecutive feature rows and the target of the row
    that follows the window.

    Args:
        X: feature table; its ``.values`` are converted to a float32 tensor.
        y: targets; its ``.values`` are converted to a float32 tensor.
        sequence_length: number of consecutive rows per window.
    """

    def __init__(self, X, y, sequence_length):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)
        self.sequence_length = sequence_length

    def __len__(self):
        # Clamp at zero: when the window is longer than the data there are no
        # valid items, and a negative value would make len() raise.
        return max(len(self.y) - self.sequence_length, 0)

    def __getitem__(self, idx):
        window_end = idx + self.sequence_length
        return self.X[idx:window_end], self.y[window_end]

# Single-shift features; the recurrent model receives its history through the
# dataset windows rather than through extra shifted columns.
X, y = sliding_window_features(df, 'Close_x', 1)

# Ordered split: the first `train_size` rows train the model, the rest test it.
train_rows, test_rows = slice(None, train_size), slice(train_size, None)
X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [9]:
# Windows of 1 consecutive row; batches keep the original row order (no shuffling).
widow_size = 1

train_dataset1, test_dataset1 = (
    FinancialDataset(features, targets, widow_size)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

train_dataloader1, test_dataloader1 = (
    DataLoader(dataset, batch_size=16, shuffle=False)
    for dataset in (train_dataset1, test_dataset1)
)

In [10]:
# Windows of 5 consecutive rows; batches keep the original row order (no shuffling).
widow_size = 5

train_dataset5, test_dataset5 = (
    FinancialDataset(features, targets, widow_size)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

train_dataloader5, test_dataloader5 = (
    DataLoader(dataset, batch_size=16, shuffle=False)
    for dataset in (train_dataset5, test_dataset5)
)

In [11]:
# Windows of 10 consecutive rows; batches keep the original row order (no shuffling).
widow_size = 10

train_dataset10, test_dataset10 = (
    FinancialDataset(features, targets, widow_size)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

train_dataloader10, test_dataloader10 = (
    DataLoader(dataset, batch_size=16, shuffle=False)
    for dataset in (train_dataset10, test_dataset10)
)

In [12]:
class MDN(nn.Module):
    """Mixture-density head producing weights, means and scales per component.

    Three independent linear layers map the input features to
    ``num_distributions`` values each.

    Args:
        in_features: size of the last dimension of the input.
        num_distributions: number of mixture components.
    """

    def __init__(self, in_features, num_distributions):
        super().__init__()
        self.in_features = in_features
        self.num_distributions = num_distributions

        # Created (and therefore initialised) in the order pi, mu, sigma.
        self.pi, self.mu, self.sigma = (
            nn.Linear(in_features, num_distributions) for _ in range(3)
        )
        self.elu = nn.ELU()

    def forward(self, x):
        """Return ``(pi, mu, sigma)``, each with ``num_distributions`` as last dimension."""
        mixture_logits = self.pi(x)
        # NOTE(review): gumbel_softmax samples noise on every call, so the
        # weights are stochastic at evaluation time as well - confirm that
        # this is intended rather than a plain softmax.
        weights = F.gumbel_softmax(mixture_logits, tau=1, dim=-1)

        means = self.mu(x)

        # ELU (default alpha) is bounded below by -1, so ELU + 1 is
        # non-negative; the tiny constant keeps the scale strictly positive.
        scales = self.elu(self.sigma(x)) + 1 + 1e-15
        return weights, means, scales


class RMDN(nn.Module):
    """Recurrent mixture density network: a GRU encoder followed by an MDN head.

    Args:
        in_features: number of features per time step (GRU input size).
        hidden_size: size of the GRU hidden state and of the MDN input.
        num_distributions: number of mixture components produced by the head.
    """

    def __init__(self, in_features=11, hidden_size=32, num_distributions=2):
        super().__init__()
        self.in_features = in_features
        self.hidden_size = hidden_size
        self.num_distributions = num_distributions

        self.GRU = nn.GRU(in_features, hidden_size, batch_first=True)
        self.MDN = MDN(hidden_size, num_distributions)

    def forward(self, x):
        """Return ``(pi, mu, sigma)`` computed from the last time step of ``x``.

        ``x`` is indexed as ``(batch, time, features)`` (the GRU is built with
        ``batch_first=True``).
        """
        # No explicit initial state: nn.GRU starts from zeros when h_0 is
        # omitted. The previous hand-built zeros tensor (wrapped in the
        # deprecated autograd.Variable) always used the default device and
        # dtype, which breaks as soon as the model or input is moved elsewhere.
        out, _ = self.GRU(x)
        last_step = out[:, -1, :]
        return self.MDN(last_step)

In [13]:
def test_rmse(model, test_dataset, test_dataloader):
    """Root-mean-square error of the mixture-mean prediction over a test set.

    Args:
        model: callable returning ``(pi, mu, sigma)`` for a batch of inputs.
        test_dataset: dataset whose length is used as the divisor.
        test_dataloader: iterable yielding ``(batch_x, batch_y)`` pairs.

    Returns:
        A tensor holding ``sqrt(sum of squared errors / len(test_dataset))``.
    """
    squared_error_total = 0
    with torch.inference_mode():
        for features, targets in tqdm(test_dataloader, desc='Test'):
            weights, means, _ = model(features)
            # Point prediction: the mean of the mixture, sum_k pi_k * mu_k.
            predicted = (weights * means).sum(dim=1)
            squared_error_total += ((targets - predicted) ** 2).sum()
    return torch.sqrt(squared_error_total / len(test_dataset))

In [14]:
# 1 / sqrt(2 * pi).
# NOTE(review): not referenced by any other cell visible in this notebook.
ONEOVERSQRT2PI = 1.0 / np.sqrt(2 * np.pi)
# log(2 * pi); log_pdf subtracts half of it as part of the Gaussian log-density.
LOG2PI = np.log(2 * np.pi)

def log_pdf(sigma, mu, target):
    """Element-wise log-density of ``target`` under a normal with mean ``mu`` and scale ``sigma``.

    ``target`` is expanded to the shape of ``sigma`` first, so one target value
    can be scored against every component.

    Returns:
        ``-log(sigma) - 0.5 * log(2 * pi) - 0.5 * ((target - mu) / sigma) ** 2``.
    """
    broadcast_target = target.expand_as(sigma)
    standardized = (broadcast_target - mu) / sigma
    log_scale = torch.log(sigma)
    return -log_scale - 0.5 * LOG2PI - 0.5 * torch.pow(standardized, 2)

def log_prob_y(pi, sigma, mu, y):
    """Log-likelihood of ``y`` under the mixture described by ``pi``, ``mu`` and ``sigma``.

    Returns:
        ``logsumexp`` over the last dimension of
        ``log_pdf(sigma, mu, y) + log(pi + 1e-15)``.
    """
    # Per-component term: log N(y | mu_k, sigma_k) + log pi_k. The small
    # constant keeps the logarithm finite when a weight is exactly zero.
    weighted_log_terms = log_pdf(sigma, mu, y) + torch.log(pi + 1e-15)
    return weighted_log_terms.logsumexp(dim=-1)

def calculate_loss(y, pi, mu, sigma, model, lambda_pi=1.0):
    """Mean negative log-likelihood plus an optional L1 penalty on the ``pi`` layer.

    Args:
        y: targets scored against the mixture.
        pi, mu, sigma: mixture weights, means and scales.
        model: object exposing ``model.MDN.pi.parameters()``.
        lambda_pi: penalty weight; the penalty is applied only when it is > 0.

    Returns:
        ``mean(-log_prob_y(pi, sigma, mu, y))`` plus
        ``lambda_pi * ||params of model.MDN.pi||_1`` (or plus 0).
    """
    nll = torch.mean(-log_prob_y(pi, sigma, mu, y))

    if lambda_pi > 0:
        # Flatten every parameter of the pi layer into one vector for the norm.
        flat_pi_params = torch.cat([p.view(-1) for p in model.MDN.pi.parameters()])
        penalty = lambda_pi * torch.norm(flat_pi_params, 1)
    else:
        penalty = 0

    return nll + penalty

Обучение

In [15]:
# Settings for this run.
num_distributions = 3
hidden_size = 4
lambda_pi = 1
EPOCHS = 10

model1 = RMDN(num_distributions=num_distributions, hidden_size=hidden_size)
opt = torch.optim.Adam(model1.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
    losses = []
    progress = tqdm(train_dataloader1, desc=f'Epoch #{epoch + 1}')
    for batch_x, batch_y in progress:
        opt.zero_grad()
        pi, mu, sigma = model1(batch_x)
        # Give the targets a trailing axis so they can be expanded against
        # the per-component outputs inside the loss.
        loss = calculate_loss(batch_y.unsqueeze(1), pi, mu, sigma, model1, lambda_pi)
        loss.backward()
        opt.step()
        losses.append(loss.item())
    # Per-epoch report: mean training loss, then RMSE on the test windows.
    print('Mean loss:', np.mean(losses))
    print('RMSE:', test_rmse(model1, test_dataset1, test_dataloader1).item())
    print()

Epoch #1: 100%|██████████████████████████████| 438/438 [00:00<00:00, 715.33it/s]


Mean loss: 2.296644694865022


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3427.77it/s]


RMSE: 0.5971273183822632



Epoch #2: 100%|██████████████████████████████| 438/438 [00:00<00:00, 749.37it/s]


Mean loss: 0.8069088954104272


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3430.10it/s]


RMSE: 0.5923519134521484



Epoch #3: 100%|██████████████████████████████| 438/438 [00:00<00:00, 749.10it/s]


Mean loss: 0.5725885226421025


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3412.40it/s]


RMSE: 0.5896502137184143



Epoch #4: 100%|██████████████████████████████| 438/438 [00:00<00:00, 765.52it/s]


Mean loss: 0.42528837803085695


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3360.73it/s]


RMSE: 0.5925706028938293



Epoch #5: 100%|██████████████████████████████| 438/438 [00:00<00:00, 767.08it/s]


Mean loss: 0.34557035468841935


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3025.76it/s]


RMSE: 0.5872411727905273



Epoch #6: 100%|██████████████████████████████| 438/438 [00:00<00:00, 748.25it/s]


Mean loss: 0.33067598465360176


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3117.57it/s]


RMSE: 0.5875681042671204



Epoch #7: 100%|██████████████████████████████| 438/438 [00:00<00:00, 769.41it/s]


Mean loss: 0.3292898225340754


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3244.33it/s]


RMSE: 0.5870961546897888



Epoch #8: 100%|██████████████████████████████| 438/438 [00:00<00:00, 768.45it/s]


Mean loss: 0.2923473850643203


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3037.30it/s]


RMSE: 0.5853250026702881



Epoch #9: 100%|██████████████████████████████| 438/438 [00:00<00:00, 761.16it/s]


Mean loss: 0.2686071776504284


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3266.70it/s]


RMSE: 0.5861722230911255



Epoch #10: 100%|█████████████████████████████| 438/438 [00:00<00:00, 766.41it/s]


Mean loss: 0.26257005769341596


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 3214.22it/s]

RMSE: 0.5845715403556824






In [16]:
# Settings for this run.
num_distributions = 3
hidden_size = 4
lambda_pi = 1
EPOCHS = 10

model5 = RMDN(num_distributions=num_distributions, hidden_size=hidden_size)
opt = torch.optim.Adam(model5.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
    losses = []
    progress = tqdm(train_dataloader5, desc=f'Epoch #{epoch + 1}')
    for batch_x, batch_y in progress:
        opt.zero_grad()
        pi, mu, sigma = model5(batch_x)
        # Give the targets a trailing axis so they can be expanded against
        # the per-component outputs inside the loss.
        loss = calculate_loss(batch_y.unsqueeze(1), pi, mu, sigma, model5, lambda_pi)
        loss.backward()
        opt.step()
        losses.append(loss.item())
    # Per-epoch report: mean training loss, then RMSE on the test windows.
    print('Mean loss:', np.mean(losses))
    print('RMSE:', test_rmse(model5, test_dataset5, test_dataloader5).item())
    print()

Epoch #1: 100%|██████████████████████████████| 438/438 [00:00<00:00, 552.56it/s]


Mean loss: 3.6001541000114727


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2626.27it/s]


RMSE: 0.5778466463088989



Epoch #2: 100%|██████████████████████████████| 438/438 [00:00<00:00, 538.49it/s]


Mean loss: 0.928514900170777


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2682.98it/s]


RMSE: 0.5751797556877136



Epoch #3: 100%|██████████████████████████████| 438/438 [00:00<00:00, 539.48it/s]


Mean loss: 0.599725710272738


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2591.78it/s]


RMSE: 0.575258731842041



Epoch #4: 100%|██████████████████████████████| 438/438 [00:00<00:00, 546.10it/s]


Mean loss: 0.450636208088235


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2566.45it/s]


RMSE: 0.575262725353241



Epoch #5: 100%|██████████████████████████████| 438/438 [00:00<00:00, 558.73it/s]


Mean loss: 0.35003975385026026


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2730.53it/s]


RMSE: 0.5757960081100464



Epoch #6: 100%|██████████████████████████████| 438/438 [00:00<00:00, 552.45it/s]


Mean loss: 0.32964971010279775


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2459.16it/s]


RMSE: 0.5752806663513184



Epoch #7: 100%|██████████████████████████████| 438/438 [00:00<00:00, 562.84it/s]


Mean loss: 0.3079636396197538


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2344.18it/s]


RMSE: 0.5756882429122925



Epoch #8: 100%|██████████████████████████████| 438/438 [00:00<00:00, 557.85it/s]


Mean loss: 0.2878580707094534


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2312.44it/s]


RMSE: 0.5758106708526611



Epoch #9: 100%|██████████████████████████████| 438/438 [00:00<00:00, 559.35it/s]


Mean loss: 0.26278335006575876


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2290.62it/s]


RMSE: 0.5760579109191895



Epoch #10: 100%|█████████████████████████████| 438/438 [00:00<00:00, 559.90it/s]


Mean loss: 0.23585541160951567


Test: 100%|█████████████████████████████████| 117/117 [00:00<00:00, 2699.16it/s]

RMSE: 0.5749793648719788






In [17]:
# Settings for this run.
num_distributions = 3
hidden_size = 4
lambda_pi = 1
EPOCHS = 10

model10 = RMDN(num_distributions=num_distributions, hidden_size=hidden_size)
opt = torch.optim.Adam(model10.parameters(), lr=1e-3)

for epoch in range(EPOCHS):
    losses = []
    progress = tqdm(train_dataloader10, desc=f'Epoch #{epoch + 1}')
    for batch_x, batch_y in progress:
        opt.zero_grad()
        pi, mu, sigma = model10(batch_x)
        # Give the targets a trailing axis so they can be expanded against
        # the per-component outputs inside the loss.
        loss = calculate_loss(batch_y.unsqueeze(1), pi, mu, sigma, model10, lambda_pi)
        loss.backward()
        opt.step()
        losses.append(loss.item())
    # Per-epoch report: mean training loss, then RMSE on the test windows.
    print('Mean loss:', np.mean(losses))
    print('RMSE:', test_rmse(model10, test_dataset10, test_dataloader10).item())
    print()

Epoch #1: 100%|██████████████████████████████| 437/437 [00:01<00:00, 408.09it/s]


Mean loss: 3.6531226758416784


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2160.62it/s]


RMSE: 0.584004819393158



Epoch #2: 100%|██████████████████████████████| 437/437 [00:01<00:00, 414.93it/s]


Mean loss: 1.0951985549373506


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2310.17it/s]


RMSE: 0.5801214575767517



Epoch #3: 100%|██████████████████████████████| 437/437 [00:01<00:00, 407.13it/s]


Mean loss: 0.6562793080948583


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2023.48it/s]


RMSE: 0.5899003148078918



Epoch #4: 100%|██████████████████████████████| 437/437 [00:01<00:00, 418.70it/s]


Mean loss: 0.4894120993403807


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2264.33it/s]


RMSE: 0.5808768272399902



Epoch #5: 100%|██████████████████████████████| 437/437 [00:01<00:00, 418.34it/s]


Mean loss: 0.3888614271043904


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2271.56it/s]


RMSE: 0.5824952125549316



Epoch #6: 100%|██████████████████████████████| 437/437 [00:01<00:00, 417.34it/s]


Mean loss: 0.3316093989790713


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2248.72it/s]


RMSE: 0.5766773819923401



Epoch #7: 100%|██████████████████████████████| 437/437 [00:01<00:00, 427.05it/s]


Mean loss: 0.2671444365473409


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2250.45it/s]


RMSE: 0.5790630578994751



Epoch #8: 100%|██████████████████████████████| 437/437 [00:01<00:00, 425.74it/s]


Mean loss: 0.265531087352029


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2201.52it/s]


RMSE: 0.5771614909172058



Epoch #9: 100%|██████████████████████████████| 437/437 [00:01<00:00, 358.03it/s]


Mean loss: 0.2605245318833858


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 1634.29it/s]


RMSE: 0.576400876045227



Epoch #10: 100%|█████████████████████████████| 437/437 [00:01<00:00, 362.39it/s]


Mean loss: 0.2468990767768902


Test: 100%|█████████████████████████████████| 116/116 [00:00<00:00, 2119.92it/s]

RMSE: 0.5767474174499512






Применение для определения цены

In [22]:
# A prediction is kept only when the mixture density evaluated at the
# predicted value itself exceeds this threshold.
THR = 1.5

with torch.inference_mode():
    # Tensor accumulators (not Python ints): if no prediction clears the
    # threshold the result is (nan, 0.0) instead of a ZeroDivisionError /
    # AttributeError on the final line.
    sq_err_sum = torch.zeros(())
    num = torch.zeros((), dtype=torch.long)
    for batch_x, batch_y in tqdm(test_dataloader1):
        pi, mu, sigma = model1(batch_x)
        value = torch.sum(pi * mu, dim=1)  # expected values
        density_at_value = torch.exp(log_prob_y(pi, sigma, mu, torch.unsqueeze(value, 1)))
        recommended = density_at_value > THR
        err = (batch_y - value)[recommended]
        sq_err_sum += torch.sum(err**2)
        num += recommended.sum()
# (RMSE over the kept predictions, share of test items that were kept)
torch.sqrt(sq_err_sum / num).item(), num.item() / len(test_dataset1)

100%|███████████████████████████████████████| 117/117 [00:00<00:00, 1883.28it/s]


(0.5268271565437317, 0.054691689008042894)

In [19]:
# A prediction is kept only when the mixture density evaluated at the
# predicted value itself exceeds this threshold.
THR = 3

with torch.inference_mode():
    # Tensor accumulators (not Python ints): if no prediction clears the
    # threshold the result is (nan, 0.0) instead of a ZeroDivisionError /
    # AttributeError on the final line.
    sq_err_sum = torch.zeros(())
    num = torch.zeros((), dtype=torch.long)
    for batch_x, batch_y in tqdm(test_dataloader5):
        pi, mu, sigma = model5(batch_x)
        value = torch.sum(pi * mu, dim=1)  # expected values
        density_at_value = torch.exp(log_prob_y(pi, sigma, mu, torch.unsqueeze(value, 1)))
        recommended = density_at_value > THR
        err = (batch_y - value)[recommended]
        sq_err_sum += torch.sum(err**2)
        num += recommended.sum()
# (RMSE over the kept predictions, share of test items that were kept)
torch.sqrt(sq_err_sum / num).item(), num.item() / len(test_dataset5)

100%|███████████████████████████████████████| 117/117 [00:00<00:00, 1632.48it/s]


(0.5685878396034241, 0.25040300913487373)

In [20]:
# A prediction is kept only when the mixture density evaluated at the
# predicted value itself exceeds this threshold.
THR = 3

with torch.inference_mode():
    # Tensor accumulators (not Python ints): if no prediction clears the
    # threshold the result is (nan, 0.0) instead of a ZeroDivisionError /
    # AttributeError on the final line.
    sq_err_sum = torch.zeros(())
    num = torch.zeros((), dtype=torch.long)
    for batch_x, batch_y in tqdm(test_dataloader10):
        pi, mu, sigma = model10(batch_x)
        value = torch.sum(pi * mu, dim=1)  # expected values
        density_at_value = torch.exp(log_prob_y(pi, sigma, mu, torch.unsqueeze(value, 1)))
        recommended = density_at_value > THR
        err = (batch_y - value)[recommended]
        sq_err_sum += torch.sum(err**2)
        num += recommended.sum()
# (RMSE over the kept predictions, share of test items that were kept)
torch.sqrt(sq_err_sum / num).item(), num.item() / len(test_dataset10)

100%|███████████████████████████████████████| 116/116 [00:00<00:00, 1395.44it/s]


(0.5197395086288452, 0.11961206896551724)