In [236]:
import torch
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader 
from torch import nn
from torch.utils.data.sampler import SequentialSampler
from torch.utils.data import Sampler

In [98]:
def get_default_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    else:
        return torch.device("cpu")

In [99]:
device = get_default_device()

In [150]:
def to_device(data,device):
    if isinstance(data,(list,tuple)):
        return [to_device(x,device) for x in data]
    return data.to(device,non_blocking = True)
    

In [33]:
class StockDataset(Dataset):
    def __init__(self,start_year = 2002,end_year = 2017,window_size = 12):
        self.start_year = start_year
        self.end_year = end_year
        self.window_size = window_size
        self.data = self._load_data(self.start_year,self.end_year)
        self.x,self.y = self._create_features_and_labels(self.data,self.window_size)
        
    def _load_data(self,start_year,end_year):
        col_name = ["date","time","open","high","low","close","volume"]
        df_full = pd.DataFrame(columns = col_name)
        for root, dirs, files in os.walk("/stock_prediction/stock_prediction/EURGBP/"):
            for file in files:
                year = int(file.split(".")[0].split("_")[-1])
                if (file.endswith(".csv") and year >= start_year 
                and year <= end_year):
                    df = pd.read_csv(os.path.join(root,file),names = col_name)
                    df_full = pd.concat([df_full,df])
                    print("Stock data of year {0} loaded".format(str(year)))
        df_full.reset_index(inplace = True)
        return df_full
    
    
    def _create_features_and_labels(self,df,window_size):
        n = df.shape[0]
        x_array = np.empty(shape = (n - window_size,window_size))
        y_array = np.empty(shape = (n - window_size,1))
        close_index = df.close
        for i in tqdm(range(n-window_size),desc =  "Creating Features and Labels..."):
            x_array[i] = close_index.iloc[i:i+window_size].values
            y_array[i] = close_index.iloc[i + window_size]
        return x_array,y_array
    
    def __getitem__(self,idx):
        return self.x[idx],self.y[idx]
    
    def __len__(self):
        return self.x.shape[0]

In [34]:
data = StockDataset(start_year=2017)

Stock data of year 2017 loaded


Creating Features and Labels...: 100%|██████████████████████████████████████| 370241/370241 [00:14<00:00, 24725.67it/s]


In [238]:
class TestSampler(Sampler):
    r"""Samples elements sequentially, always in the same order.

    Arguments:
        data_source (Dataset): dataset to sample from
    """

    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        return iter(range(self.data_source[0],self.data_source[-1] + 1))

    def __len__(self):
        return len(self.data_source)

In [239]:
train_split = .8
dataset_size = len(data)
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))
train_indices, test_indices = indices[:split], indices[split:]
train_sampler = SequentialSampler(train_indices)
test_sampler = TestSampler(test_indices)

In [240]:
train_dl = DataLoader(data,batch_size=32,sampler=train_sampler)
test_dl = DataLoader(data,batch_size=32,sampler=test_sampler)

In [257]:
class DeviceDataLoader():
    def __init__(self,dl,device):
        self.dl = dl
        self.device = device
    def __iter__(self):
        for batch in self.dl:
            yield to_device(batch,self.device)
    def __len__(self):
        return len(self.dl)

In [258]:
train_dl = DeviceDataLoader(train_dl,device)
test_dl = DeviceDataLoader(test_dl,device)

In [95]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # 28 time steps
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[-1, :, :]) 
        # out.size() --> 100, 10
        return out

In [96]:
model = LSTMModel(input_dim = 1 , hidden_dim = 8, layer_dim = 1, output_dim = 1)

In [97]:
model(x.view((12,32,1)).float()).shape

torch.Size([32, 1])