In [12]:
import os
import torch
import torch.nn as nn
import torch.utils.data as Data
from sklearn.preprocessing import MinMaxScaler
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import talib
import plotly.graph_objects as go


In [61]:
path = "/Users/zed/AI_Lab/DoubleEnsembleML/Data/"
list = ["BTC",'DOGE',"ETC",'ETH','FIL','LTC','XRP']
data = pd.read_csv(path +list[0]+".csv")
data.columns 

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume (BTC)',
       'Volume (Currency)', 'Weighted Price'],
      dtype='object')

In [62]:
dataset = data
dataset['H-L'] = dataset['High'] - dataset['Low']
dataset['O-C'] = dataset['Close'] - dataset['Open']
dataset['3day MA'] = dataset['Close'].shift(1).rolling(window = 3).mean()
dataset['10day MA'] = dataset['Close'].shift(1).rolling(window = 10).mean()
dataset['30day MA'] = dataset['Close'].shift(1).rolling(window = 30).mean()
dataset['Std_dev']= dataset['Close'].rolling(5).std()
dataset['RSI'] = talib.RSI(dataset['Close'].values, timeperiod = 9)
dataset['ATR'] = talib.ATR(dataset['High'].values, dataset['Low'].values, dataset['Close'].values, 7)
data  = dataset.dropna(how = "any")
data 


Unnamed: 0,Date,Open,High,Low,Close,Volume (BTC),Volume (Currency),Weighted Price,H-L,O-C,3day MA,10day MA,30day MA,Std_dev,RSI,ATR
30,2014-02-06,802.50000,803.72713,767.60884,767.60884,33.592125,2.612999e+04,777.860501,36.11829,-34.89116,808.226667,807.572468,824.918283,20.390928,34.892078,27.788285
31,2014-02-07,775.20510,775.20510,666.35039,714.12206,64.823212,4.640043e+04,715.799678,108.85471,-61.08304,790.836280,806.333352,823.505245,42.058537,24.758009,39.369203
32,2014-02-08,711.41118,730.00000,680.01000,699.30255,31.042254,2.196645e+04,707.630659,49.99000,-12.10863,761.410300,796.146558,819.809884,48.505919,22.702711,40.886460
33,2014-02-09,692.37535,756.09980,684.85098,689.00000,38.291274,2.758184e+04,720.316621,71.24882,-3.37535,727.011150,786.576813,815.057658,48.591578,21.318570,45.223940
34,2014-02-10,686.83613,748.00000,550.00000,690.20000,29.856668,1.959739e+04,656.382333,198.00000,3.36387,700.808203,774.676813,809.446639,32.643148,21.942173,67.049091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2652,2021-04-21,56500.00000,56805.50000,53600.00000,53815.20000,4788.013532,2.645352e+08,55249.477244,3205.50000,-2684.80000,56153.266667,59957.890000,57851.923333,2276.730403,31.587568,3594.414995
2653,2021-04-22,53815.30000,55459.40000,50500.00000,51730.00000,10144.204074,5.396125e+08,53194.167081,4959.40000,-2085.30000,55345.066667,59343.810000,57842.593333,2012.718937,26.873534,3789.412853
2654,2021-04-23,51709.30000,52124.40000,47549.30000,51178.70000,11159.243380,5.545896e+08,49697.773594,4575.10000,-530.60000,54015.066667,58534.440000,57755.216667,2352.239108,25.731360,3901.653874
2655,2021-04-24,51178.60000,51200.00000,48746.40000,50093.40000,4039.459093,2.021033e+08,50032.268888,2453.60000,-1085.20000,52241.300000,57293.590000,57718.046667,2536.070407,23.517675,3694.789035


In [66]:
Y = data["Weighted Price"]
X = data.drop(["Date","Weighted Price"],axis=1 )

In [67]:
def sliding_windows(data, label, seq_length):
    x = []
    y = []

    for i in range(len(data)-seq_length-1):
        _x = data[i:(i+seq_length)]
        _y = label[i+seq_length,:]
        x.append(_x)
        y.append(_y)

    return np.array(x),np.array(y)

In [70]:
from sklearn.preprocessing import MinMaxScaler

Y = Y.values.reshape(-1,1)
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

#converting dataset into x_train and y_train
X = x_scaler.fit_transform(X)
Y = y_scaler.fit_transform(Y)


In [71]:
x, y = sliding_windows(X, Y, 15)

In [72]:
class LSTM(nn.Module):
    
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.batch_first = False
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers)
        self.th = nn.Tanh()
        self.fc = nn.Linear(hidden_size, num_classes)

        # self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        #不手动指定 h 和 c 的话，默认就是 0
        # h_0 = torch.zeros(
        #     self.num_layers, x.size(1), self.hidden_size)
        
        # c_0 = torch.zeros(
        #     self.num_layers, x.size(0), self.hidden_size)
        
        # #Propagate input through LSTM
        # ula, (h_out, _) = self.lstm(x, (h_0, c_0))
        # ula, (h_out, _) = self.lstm(x)
        r_out, (h_n, h_c) = self.lstm(x)  # None 表示 hidden state 会用全0的 state
        out_0 = self.th(r_out)
        out = self.fc(r_out)
        return out
        
        # h_out = h_out.view(-1, self.hidden_size)
        
        # out = self.fc(h_out)
        # output = self.softmax(out)
        # return output

In [79]:
i = 0.8
y_train,y_test = y[:int(y.shape[0]*i)],y[int(y.shape[0]*i):]
x_train,x_test = x[:int(x.shape[0]*i)],x[int(x.shape[0]*i):]

# lstm: seq, batch, feature
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

dataX = torch.Tensor(x.transpose(1,0,2))
dataY = torch.Tensor(y)
trainX = torch.Tensor(x_train.transpose(1,0,2))
trainY = torch.Tensor(y_train)
testX = torch.Tensor(x_test.transpose(1,0,2))
testY = torch.Tensor(y_test)
trainX.shape, trainY.shape


(torch.Size([15, 2088, 14]), torch.Size([2088, 1]))

In [80]:
num_epochs = 1000
learning_rate = 0.002
device = "cpu"
input_size = X.shape[1] # The number of expected features in the input x
hidden_size = 100        # The number of features in the hidden state h
num_layers = 1 # Number of recurrent layers.
seq_length = 15
num_classes = 1 # output

lstm = LSTM(num_classes, input_size, hidden_size, num_layers)

criterion = torch.nn.MSELoss()  # mean-squared error for regression
#optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# Train the model
lstm.train()
lstm.to(device)
trainX = trainX.to(device)
for epoch in tqdm(range(num_epochs)):
    optimizer.zero_grad()
    outputs = lstm(trainX)
    # obtain the loss function
    loss = criterion(outputs, trainY)
    
    loss.backward()
    
    optimizer.step()
    if epoch%200 == 0 :
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

  return F.mse_loss(input, target, reduction=self.reduction)
  0%|          | 1/1000 [00:00<05:48,  2.87it/s]

Epoch: 0, loss: 0.00409


 20%|██        | 201/1000 [00:45<02:59,  4.45it/s]

Epoch: 200, loss: 0.00013


 40%|████      | 401/1000 [01:30<02:18,  4.32it/s]

Epoch: 400, loss: 0.00011


 60%|██████    | 601/1000 [02:16<01:30,  4.42it/s]

Epoch: 600, loss: 0.00011


 80%|████████  | 801/1000 [03:02<00:45,  4.36it/s]

Epoch: 800, loss: 0.00010


100%|██████████| 1000/1000 [03:48<00:00,  4.38it/s]


In [91]:

lstm.eval()
lstm.to(torch.device(device))
with torch.no_grad():
    dataY_pred = lstm(dataX)

dataY_pred = dataY_pred.data.numpy()
dataY_truth = dataY.data.numpy()

dataY_pred = y_scaler.inverse_transform(dataY_pred)
dataY_truth = y_scaler.inverse_transform(dataY_truth)


fig = go.Figure(go.Scatter(y=dataY_truth.flatten(),name='Ground Truth'))
fig.add_trace(go.Scatter(y=dataY_pred.flatten(),name='Predicted'))

fig.update_layout(
    shapes = [dict(
        x0=len(x_train), x1=len(x_train), y0=0, y1=1, xref='x', yref='paper',
        line_width=2)], #在图上划分训练集和测试集
    xaxis_rangeslider_visible=True,
)



ValueError: Found array with dim 3. Estimator expected <= 2.