In [1]:
import sys 
sys.path.append("../")

import pandas as pd
import numpy as np
from IPython.display import display

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

import torch
from torch.utils.data import Dataset, DataLoader

from pgtaa import config as cfg
from pgtaa.core.utils import read_data

In [7]:
def reshaped_data(df: pd.DataFrame, window_size: int=100, pred_range: int=5, scaler=None):
    x = df.copy()
    y = df.iloc[:, :8] # select prediction labels
    y = np.log(y.pct_change(pred_range) + 1)
    y = y.values[pred_range + 1:]
    x.iloc[:, :8] = x.iloc[:, :8].pct_change(1)
    x = x[1:-pred_range]
    try:
        x = scaler.transform(x) # scaler has to be fitted already
    except Exception as e:
        print(e)
    
    xdata, ydata = [], []
    for step in range(x.shape[0] - window_size):
        xdata.append(x[step: window_size + step])
        ydata.append(y[window_size + step: window_size + step + 1])
        
    xdata = np.array(xdata)
    ydata = np.array(ydata).reshape((x.shape[0] - window_size, y.shape[1]))
    
    # xdata has the shape (samples, window_size, columns)
    # ydata has the shape (samples, nb_assets)
    return xdata, ydata


class TimeSeriesDataset(Dataset):
    def __init__(self, xdata, ydata):
        super(TimeSeriesDataset, self).__init__()
        self.xdata = xdata
        self.ydata = ydata
        
    def __getitem__(self, x):
        return torch.from_numpy(self.xdata[x]), torch.from_numpy(self.ydata[x])
    
    def __len__(self):
        return len(self.ydata)
    
    @classmethod
    def from_spec(cls, df: pd.DataFrame, pred_range: int=5):
        from pgtaa import config as cfg
        xdata, ydata = reshaped_data( 
            df=df,
            window_size=cfg.WINDOW_SIZE, 
            pred_range=pred_range, 
            scaler=cfg.get_scaler()
        )
        return cls(xdata, ydata)

In [63]:
#x, y = reshaped_data(df, pred_range=1, scaler=get_scaler())

In [64]:
#df.shape, x.shape, y.shape

((3123, 32), (3021, 100, 32), (3021, 8))

In [8]:
x = read_data(cfg.TRAIN_CSV, cfg.NB_ASSETS, return_array=False)
train = x.iloc[:int(len(x)*0.8)]
valid = x.iloc[int(len(x)*0.8):]
ds_train = TimeSeriesDataset.from_spec(train, pred_range=5)
ds_valid = TimeSeriesDataset.from_spec(valid, pred_range=5)

In [12]:
dl_train = DataLoader(ds_train, batch_size=20, shuffle=True, num_workers=2)
dl_valid = DataLoader(ds_valid, batch_size=20, shuffle=True, num_workers=2)

In [18]:
loss = 0.0
for i, data in enumerate(dl_train):
    loss += 1.5
print(loss / (i + 1))

1.5


In [20]:
print("Epochs: {}, Episodes: {}, Horizon: {}, Window Size: {}, Columns: {}".format(*p.pinit.windows.shape))

Epochs: 200, Episodes: 200, Horizon: 30, Window Size: 100, Columns: 32


In [75]:
import pandas as pd  
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

#from sklearn.svm import SVR
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.linear_model import SGDRegressor
#from sklearn.manifold import TSNE
#from sklearn.externals import joblib

from pgtaa.core.predictor_preproc import dl_from_spec

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True

max_epochs = 5
train_dl, valid_dl = dl_from_spec()
print(len(train_dl.dataset))

# LSTM network
class PredLSTM(nn.Module):
    def __init__(self, input_dim=32, hidden_dim=(128, 64, 32), batch_size=32, output_dim=8):
        super(PredLSTM, self).__init__()
        self.lstm1 = nn.LSTMCell(input_dim, hidden_dim[0])
        self.lstm2 = nn.LSTMCell(hidden_dim[0], hidden_dim[1])
        self.lstm3 = nn.LSTMCell(hidden_dim[1], hidden_dim[2])
        self.linear = nn.Linear(hidden_dim[2], output_dim, bias=False)
        self.relu = nn.ReLU(inplace=False)
        self.dropout = nn.Dropout(p=0.4, inplace=False)
        self.batch_size = batch_size
    
    def forward(self, x):
        print(x.shape)
        x = self.lstm1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.lstm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.lstm3(x)
        x = self.linear(x)
        return x
    
net = PredLSTM().double()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)   


for epoch in range(max_epochs):
    train_loss = 0.0
    val_loss = 0.0
    for i, data in enumerate(train_dl):
        inputs, label = data[0].to(device), data[1].to(device)
        print(inputs.shape)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # (seq_length,1,input_dim)
        #x = x.view(len(x), self.batch_size, -1)
        outputs = net(inputs.view(100, 32, -1))
        print(outputs)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # validation
    with torch.set_grad_enabled(False):
        for j, vdata in enumerate(valid_dl):
            vinput, vlabel = vdata[0].to(device), vdata[1].to(device)
            outputs = net(vinput.view(100, 32, -1))
            loss = criterion(outputs, vlabel)
            val_loss += loss.item()

    print(f"Train Loss: {train_loss / (i + 1)}, Validation Loss: {val_loss / (j + 1)}")
    train_loss = 0.0
    val_loss = 0.0

print('Finished Training')

2392
torch.Size([32, 100, 32])
torch.Size([100, 32, 32])


RuntimeError: The size of tensor a (32) must match the size of tensor b (100) at non-singleton dimension 1