In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler

In [2]:
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Input CSVs are looked up under ./data relative to the current working directory.
_cwd = os.getcwd()
train_path = _cwd + '/data/train.csv'
sup_path = _cwd + '/data/supplemental_train.csv'
asset_path = _cwd + '/data/asset_details.csv'

# When True, the loading cell keeps only a tail slice of the training rows.
DEBUG = True

In [3]:
# Training rows, indexed by their `timestamp` column.
train = pd.read_csv(train_path).set_index("timestamp")
assets = pd.read_csv(asset_path)

# Only the first 14 Asset_ID values of the supplemental file are used (they
# define the asset ordering), so parse just that column and those rows
# instead of loading the whole file.
assets_order = pd.read_csv(sup_path, usecols=['Asset_ID'], nrows=14).Asset_ID
assets_order = {asset_id: position for position, asset_id in enumerate(assets_order)}

# Number of leading rows skipped when DEBUG is on.
DEBUG_SKIP_ROWS = 10_000_000

if DEBUG:
    # .copy() makes the slice an independent frame, so the column assignments
    # below do not write into a slice of the full frame
    # (avoids pandas' SettingWithCopyWarning).
    train = train.iloc[DEBUG_SKIP_ROWS:].copy()

# Down-cast the numeric columns to float32.
float_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target']
train[float_columns] = train[float_columns].astype(np.float32)

# NOTE(review): this forward-fill runs over the frame in row order without
# grouping by Asset_ID, so a missing Target may be filled from a row that
# belongs to a different asset -- confirm that this is intended.
train['Target'] = train['Target'].ffill()

In [4]:
# Largest and smallest finite VWAP values; np.isfinite filters out NaN and +/-inf.
finite_vwap = train.loc[np.isfinite(train.VWAP), 'VWAP']
VWAP_max = np.max(finite_vwap)
VWAP_min = np.min(finite_vwap)
print(VWAP_max, "\n", VWAP_min)

# Replace +inf / -inf with the finite extremes found above. nan_to_num's
# default `nan=0.0` additionally turns any NaN into 0.
train['VWAP'] = np.nan_to_num(train.VWAP, posinf=VWAP_max, neginf=VWAP_min)

64799.8203125 
 -799.7470703125


In [5]:
def add_features(df):
    """Append candle-derived feature columns to ``df`` in place and return it.

    The columns are added in this order: ``Upper_Shadow``, ``Lower_Shadow``,
    ``spread``, ``mean_trade`` and ``log_price_change``. They are computed
    from the ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and ``Count``
    columns, which ``df`` must already contain.

    Args:
        df: pandas DataFrame holding the columns listed above; it is modified
            in place.

    Returns:
        The same ``df`` object, with the five feature columns added.
    """
    open_px, close_px = df['Open'], df['Close']
    high_px, low_px = df['High'], df['Low']

    # Top and bottom of the candle body (the Open/Close range).
    body_top = np.maximum(close_px, open_px)
    body_bottom = np.minimum(close_px, open_px)

    derived = {
        'Upper_Shadow': high_px - body_top,
        'Lower_Shadow': body_bottom - low_px,
        'spread': high_px - low_px,
        'mean_trade': df['Volume'] / df['Count'],
        'log_price_change': np.log(close_px / open_px),
    }
    # dicts keep insertion order, so the columns are appended in the order above.
    for column, values in derived.items():
        df[column] = values
    return df


In [6]:
# Append the derived candle features. add_features writes the new columns into
# `train` itself and returns that same frame.
train=add_features(train)

In [7]:
# Display (rows, columns) of the frame after the feature columns were added.
train.shape

(14236806, 14)

In [8]:
# Every column except the asset identifier and the regression target is scaled.
scale_features = train.columns.drop(['Asset_ID','Target'])

# NOTE(review): the scaler is fitted on all rows of `train`, including the tail
# that is later held out for validation -- confirm this is acceptable.
RS = RobustScaler()
RS.fit(train[scale_features])
train[scale_features] = RS.transform(train[scale_features])

In [9]:
# Distinct timestamps in order of first appearance; the first and last entries
# bound the regular grid built in reindex().
ind = train.index.unique()

def reindex(df):
    """Align one asset's rows onto the regular timestamp grid.

    The grid runs from ``ind[0]`` to ``ind[-1]`` inclusive in steps of 60.
    Grid points missing from ``df`` take the nearest existing row; any NaN
    values that remain are then forward- and back-filled.

    Args:
        df: rows of a single asset, indexed by timestamp.

    Returns:
        A new DataFrame indexed by the full grid.
    """
    grid = range(ind[0], ind[-1] + 60, 60)
    aligned = df.reindex(grid, method='nearest')
    return aligned.ffill().bfill()

# Pad each asset separately, drop the Asset_ID index level that groupby adds,
# and restore timestamp order.
train = (
    train.groupby('Asset_ID')
    .apply(reindex)
    .reset_index(0, drop=True)
    .sort_index()
)
train.shape

(14978530, 14)

In [10]:
# Map every distinct timestamp to its 0-based position.
times = {timestamp: position for position, timestamp in enumerate(train.index.unique())}

# Build one "<timestamp position>_<Asset_ID>" key per row.
# NOTE(review): `train` has already been padded by the reindex step when this
# runs, so `ids` covers every row, padded ones included -- confirm this is
# intended, since the later `is_real` flag is derived from membership in `ids`.
id_frame = train[['Asset_ID']].copy()
id_frame['id'] = id_frame.index.map(times)
id_frame['id'] = id_frame['id'].astype(str) + '_' + id_frame['Asset_ID'].astype(str)
ids = id_frame['id'].copy()

del id_frame

In [11]:
# Position of each row's timestamp; rows whose timestamp is not in `times` are dropped.
train['group_num'] = train.index.map(times)
train = train.dropna(subset=['group_num'])
train['group_num'] = train['group_num'].astype('int')

# Flag rows whose "<group_num>_<Asset_ID>" key is present in `ids` (1) or not (0).
# NOTE(review): `ids` looks like it was built from this same frame, in which
# case every row matches and is_real is always 1 -- confirm.
row_keys = train['group_num'].astype(str) + '_' + train['Asset_ID'].astype(str)
train['is_real'] = row_keys.isin(ids)*1

# Zero out every value column on rows flagged as not real.
features = train.columns.drop(['Asset_ID','group_num','is_real'])
not_real = train.is_real==0
train.loc[not_real, features]=0

# Order rows by timestamp position, then by the asset ordering from `assets_order`.
train['asset_order'] = train.Asset_ID.map(assets_order)
train = train.sort_values(by=['group_num', 'asset_order'])
train.head(20)

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Upper_Shadow,Lower_Shadow,spread,mean_trade,log_price_change,group_num,is_real,asset_order
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1567988760,3,-0.1,-0.061726,-0.061663,-0.06178,-0.061727,3.241192,-0.061726,-0.003118,-0.044201,-0.044824,-0.051079,2.928198,0.270232,0,1,0
1567988760,2,-0.103125,1.153214,1.153046,1.154163,1.154362,-0.029595,1.153938,-0.001868,0.824421,1.090371,1.08192,-0.018208,1.11001,0,1,1
1567988760,0,-0.225,0.026766,0.026678,0.026764,0.026695,-0.017931,0.026716,-0.001786,-0.032581,-0.016819,-0.010301,0.009681,-0.977865,0,1,2
1567988760,1,0.753125,41.236725,41.254543,41.247993,41.239754,-0.03017,41.240818,3.1e-05,90.835754,62.930336,56.400654,-0.018964,0.064238,0,1,3
1567988760,4,-0.3,-0.061901,-0.061839,-0.061956,-0.061902,-0.009605,-0.061901,0.0,-0.04443,-0.044916,-0.051212,0.601599,0.0,0,1,4
1567988760,5,0.4375,-0.046982,-0.046899,-0.047032,-0.046981,0.257832,-0.046974,-0.001891,0.001777,-0.02299,-0.0259,0.053005,0.111591,0,1,5
1567988760,7,-0.221875,-0.035701,-0.035678,-0.035726,-0.035704,-0.013188,-0.035702,-0.000874,-0.035609,-0.031858,-0.042529,0.019062,-0.109026,0,1,6
1567988760,6,0.2125,0.655191,0.656425,0.655151,0.655766,-0.025427,0.655491,-0.000688,1.873816,1.244462,1.317384,-0.017208,0.938899,0,1,7
1567988760,8,-0.303125,-0.060943,-0.060882,-0.060996,-0.060944,-0.027158,-0.060943,-0.001638,-0.04443,-0.044916,-0.051212,0.178575,0.0,0,1,8
1567988760,9,0.05,0.217322,0.217963,0.217329,0.21759,-0.022785,0.217527,0.002092,0.780189,0.396992,0.506529,-0.015004,1.131879,0,1,9


In [12]:
# Pull the regression target out as a flat array before the frame is reduced
# to the model's input columns.
targets = train.Target.to_numpy()
#targets = np.expand_dims(targets, axis=1)

# Model inputs: every column except the identifier, the target and the bookkeeping flags.
features = train.columns.drop(['Asset_ID', 'Target', 'group_num','is_real'])

# From here on `train` is a plain NumPy array, no longer a DataFrame.
train = np.array(train[features])
train.shape

(14978530, 13)

In [13]:
class SampleGenerator(torch.utils.data.Dataset):
    """Serve batches of fixed-length sliding windows over row-aligned arrays.

    Indexing returns a whole batch, not a single sample: item ``idx`` holds the
    windows whose first rows are ``idx * batch_size`` up to
    ``idx * batch_size + batch_size - 1``. Each window spans ``length``
    consecutive rows of ``x_set`` and is paired with the ``y_set`` value of the
    window's last row. Windows that would run past the end of the data are
    left out, so the final batch may be smaller than ``batch_size``.

    Args:
        x_set: array of inputs, indexed by row along axis 0.
        y_set: 1-D NumPy array with one target per row of ``x_set``.
        batch_size: number of consecutive window start positions per batch.
        length: number of rows in each window.
    """

    def __init__(self, x_set, y_set, batch_size, length):
        # Give the targets a trailing axis so each one is a length-1 vector.
        self.x, self.y = x_set, y_set[:,np.newaxis]
        self.batch_size = batch_size
        self.length = length
        self.size = len(x_set)

    def __len__(self):
        """Return the number of batches that contain at least one full window.

        Counting from the number of complete windows (rather than the number
        of rows) avoids reporting a trailing batch that would always be empty.
        """
        n_windows = self.size - self.length + 1
        if n_windows <= 0:
            return 0
        # Integer ceiling division.
        return -(-n_windows // self.batch_size)

    def __getitem__(self, idx):
        """Return ``(batch_x, batch_y)`` tensors for batch number ``idx``.

        When no complete window starts inside this batch, both tensors are
        empty (their first dimension is 0).
        """
        first = self.batch_size * idx
        # A window starting after `size - length` would run off the end of the data.
        stop = min(first + self.batch_size, self.size - self.length + 1)
        starts = range(first, stop)

        batch_x = [self.x[s:s + self.length] for s in starts]
        batch_y = [self.y[s + self.length - 1] for s in starts]

        return torch.tensor(np.array(batch_x)), torch.tensor(np.array(batch_y))

In [14]:
# Hold out the final ~10% of rows as the validation set; everything before it is
# used for training. The same cut point is applied to inputs and targets.
split_at = -len(train)//10
X_train, X_test = train[:split_at], train[split_at:]
y_train, y_test = targets[:split_at], targets[split_at:]

In [15]:
Batch_size = 64
# Derived from the data instead of hard-coded, so the model's input width
# stays correct if the feature set changes.
Input_size = X_train.shape[1]  # Input feature size
Hidden_size = 64  # Hidden state size
Output_size = 1
Train_window = 15  # Rows per input window

train_generator = SampleGenerator(X_train, y_train, length=Train_window, batch_size=Batch_size)
val_generator = SampleGenerator(X_test, y_test, length=Train_window, batch_size=Batch_size)

# Build the first batch once and report the shapes of its two tensors.
sample_x, sample_y = train_generator[0]
print(f'Sample shape: {sample_x.shape}')
print(f'Target shape: {sample_y.shape}')

Sample shape: torch.Size([64, 15, 13])
Target shape: torch.Size([64, 1])


In [16]:
class LSTM(nn.Module):
    """Single-layer, batch-first LSTM with a linear head on the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of values produced per sequence.
        train_window: stored as ``seq_length``; not otherwise used by this class.
    """

    def __init__(self, input_size, hidden_size, output_size, train_window):
        super().__init__()

        # Keep the constructor arguments available as attributes.
        self.input_size, self.hidden_size = input_size, hidden_size
        self.output_size = output_size
        self.seq_length = train_window

        # Sub-modules are created in this order: recurrent layer, then head.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_seq):
        """Run the LSTM over ``input_seq`` and project its final step's output.

        ``input_seq`` is batch-first (the LSTM is built with ``batch_first=True``);
        the result has one row per sequence and ``output_size`` columns.
        """
        hidden_states, _ = self.lstm(input_seq)
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)


In [23]:
def masked_cosine(y_true, y_pred):
    """
    Cosine similarity between y_true and y_pred, restricted to the positions
    where y_true is non-zero.

    Args:
        y_true (torch.Tensor): Ground truth tensor (target).
        y_pred (torch.Tensor): Predicted tensor, indexable by the mask built
            from y_true.

    Returns:
        torch.Tensor: Cosine similarity of the two masked (flattened) tensors.
    """
    # Boolean indexing flattens the kept entries into 1-D vectors.
    keep = y_true.ne(0)
    kept_true, kept_pred = y_true[keep], y_pred[keep]

    return torch.nn.functional.cosine_similarity(kept_true, kept_pred, dim=0)


def masked_mse(y_true, y_pred):
    """
    Calculates the mean squared error between y_true and y_pred, ignoring
    every position whose target is exactly zero.

    Args:
        y_true (torch.Tensor): Ground truth tensor (target).
        y_pred (torch.Tensor): Predicted tensor, same shape as y_true.

    Returns:
        torch.Tensor: Scalar masked MSE. If no target is non-zero, a zero
        scalar that is still attached to y_pred's graph, so calling
        ``backward()`` on it is valid and yields zero gradients.
    """
    mask = (y_true != 0)  # True where the target is non-zero

    if not mask.any():
        # mse_loss over an empty selection evaluates to NaN, and
        # back-propagating NaN would corrupt the model weights. Return a
        # graph-connected zero instead.
        return y_pred.sum() * 0.0

    # Mean squared error over the kept positions only.
    return torch.nn.functional.mse_loss(y_pred[mask], y_true[mask])



def model_test(net=None, data=None, loss_fn=None):
    """Return the mean per-batch loss of a model over a batch generator.

    Called with no arguments it evaluates the module-level ``model`` on
    ``val_generator`` using ``masked_mse``, as before. Gradients are disabled
    during evaluation. Switching the model between train and eval mode is
    left to the caller.

    Args:
        net: callable mapping an input batch to predictions; defaults to the
            module-level ``model``.
        data: object supporting ``len()`` and integer indexing that yields
            ``(inputs, targets)`` tensor pairs; defaults to ``val_generator``.
        loss_fn: callable ``loss_fn(target, output)`` returning a scalar
            tensor; defaults to ``masked_mse``.

    Returns:
        float: the loss averaged over the batches that were actually
        evaluated, or NaN when there were none.
    """
    if net is None:
        net = model
    if data is None:
        data = val_generator
    if loss_fn is None:
        loss_fn = masked_mse

    total_loss = 0.0
    n_scored = 0
    with torch.no_grad():
        for i in range(len(data)):
            # Build the batch once; indexing the generator recomputes it every time.
            batch_x, batch_y = data[i]
            if batch_x.shape[0] == 0:
                # An empty batch means no complete windows are left.
                break
            output = net(batch_x.float())
            total_loss += loss_fn(batch_y.float(), output).item()
            n_scored += 1

    # Average over the evaluated batches only, so a skipped trailing batch
    # does not dilute the mean.
    return total_loss / n_scored if n_scored else float('nan')


In [24]:
# Train the LSTM for one pass over the training batches, evaluating on the
# validation set at regular intervals.

# Seed PyTorch so weight initialisation is repeatable across runs.
SEED = 0
torch.manual_seed(SEED)

# Create an instance of the LSTM model
model = LSTM(Input_size, Hidden_size, Output_size,Train_window)

# Define the optimizer (the loss is masked_mse, applied in the loop below)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Validation MSE recorded at every evaluation point.
MSE=[]
# Number of training steps between two validation runs.
EVAL_EVERY = 10000

# Training loop: a single epoch.
n_batches = len(train_generator)
for idx in range(n_batches):
    # Build the batch once; indexing the generator recomputes it every time.
    batch_x, batch_y = train_generator[idx]
    if batch_x.shape[0]==0:
        # An empty batch means no complete windows are left.
        break
    input_seq = batch_x.float()
    target = batch_y.float()

    optimizer.zero_grad()
    output = model(input_seq)
    loss = masked_mse(target,output)
    loss.backward()
    optimizer.step()

    if idx%EVAL_EVERY==0:
        # Evaluate in eval mode, then switch back to training mode.
        model.eval()
        mse=model_test()
        print(f"Processing {100*idx/len(train_generator)}%, the mse is {mse}")
        MSE.append(mse)
        model.train()

# Now your model is trained! You can use it for predictions.

Processing 0.0%, the mse is 0.006106798228488366
Processing 4.747526538673351%, the mse is 2.5946126134022618e-05
Processing 9.495053077346702%, the mse is 1.876750949734649e-05
Processing 14.242579616020054%, the mse is 1.716220831202906e-05
Processing 18.990106154693404%, the mse is 1.7458416898147434e-05
Processing 23.737632693366756%, the mse is 1.872622091565818e-05
Processing 28.485159232040107%, the mse is 1.8647627760400082e-05
Processing 33.23268577071346%, the mse is 1.724352607118065e-05
Processing 37.98021230938681%, the mse is 1.8124045844482115e-05
Processing 42.72773884806016%, the mse is 1.8466905341195772e-05
Processing 47.47526538673351%, the mse is 1.7486526791303526e-05
Processing 52.22279192540686%, the mse is 1.7616686170241312e-05
Processing 56.970318464080215%, the mse is 2.2381341782424626e-05
Processing 61.71784500275356%, the mse is 1.7664448752071427e-05
Processing 66.46537154142692%, the mse is 1.844727123265445e-05
Processing 71.21289808010027%, the mse is