In [7]:
!pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.54.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.7/163.7 kB[0m [31m797.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.9.2-cp311-cp311-manyli

In [8]:
import os
import torch
import random
import requests
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import os
# from sklearn import preprocessing
import matplotlib.pyplot as plt
import copy
import time
plt.style.use('ggplot')

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BaseTrainer:
    def __init__(self, model, criterion, optimizer, train_loader, val_loader):
        self.model = model
        self.criterion = criterion  #the loss function
        self.optimizer = optimizer  #the optimizer
        self.train_loader = train_loader  #the train loader
        self.val_loader = val_loader  #the valid loader

    #the function to train the model in many epochs
    def fit(self, num_epochs):
        self.num_batches = len(self.train_loader)

        for epoch in range(num_epochs):
            print(f'Epoch {epoch + 1}/{num_epochs}')
            start = time.time()
            train_loss, train_accuracy = self.train_one_epoch()
            val_loss, val_accuracy = self.validate_one_epoch()
            end = time.time()
            print(f"total time for each epoch {end - start}")
            print(
                f'{self.num_batches}/{self.num_batches} - train_loss: {train_loss:.4f} - train_accuracy: {train_accuracy*100:.4f}% \
                - val_loss: {val_loss:.4f} - val_accuracy: {val_accuracy*100:.4f}%')
            
    #the function to train the model in many epochs
    def fit_withpatience(self, num_epochs, patience):
        p_count = patience
        model_states = []
        self.num_batches = len(self.train_loader)
        min_val_loss = float('inf')

        max_train_acc = 0.0
        max_val_acc = 0.0
        for epoch in range(num_epochs):
            print(f'Epoch {epoch + 1}/{num_epochs}')
            start = time.time()
            train_loss, train_accuracy = self.train_one_epoch()
            val_loss, val_accuracy = self.validate_one_epoch()
            end = time.time()
            print(f"total time for each epoch {end - start}")
            if val_loss < min_val_loss:
                min_val_loss = val_loss
                p_count=patience
            else:
                p_count-=1

            if p_count <= 0:
                print('Stopping training')
                self.model.load_state_dict(model_states[-1*patience])
                # print(self.model.state_dict())
                return max_train_acc, max_val_acc
            print(
                f'{self.num_batches}/{self.num_batches} - train_loss: {train_loss:.4f} - train_accuracy: {train_accuracy*100:.4f}% \
                - val_loss: {val_loss:.4f} - val_accuracy: {val_accuracy*100:.4f}%')
            # print(self.model.state_dict())
            model_states.append(copy.deepcopy(self.model.state_dict()))
            
            if(train_accuracy>max_train_acc): max_train_acc=train_accuracy
            if(val_accuracy>max_val_acc): max_val_acc=val_accuracy
        return max_train_acc, max_val_acc
            

    #train in one epoch, return the train_acc, train_loss
    def train_one_epoch(self):
        self.model.train()
        running_loss, correct, total = 0.0, 0, 0
        for i, data in enumerate(self.train_loader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        train_accuracy = correct / total
        train_loss = running_loss / self.num_batches
        return train_loss, train_accuracy

    #evaluate on a loader and return the loss and accuracy
    def evaluate(self, loader):
        self.model.eval()
        loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for data in loader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        loss = loss / len(self.val_loader)
        return loss, accuracy
    
    #evaluate on a loader and return the loss and accuracy
    def evaluate_withprobs(self, loader):
        self.model.eval()
        loss, correct, total = 0.0, 0, 0
        probs = np.empty([0])
        with torch.no_grad():
            for data in loader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = torch.softmax(self.model(inputs),dim=1)
                outputs_np = outputs.cpu().detach().numpy()[:,1]
                probs = np.append(probs, outputs_np)
                loss = self.criterion(outputs, labels)
                loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        loss = loss / len(self.val_loader)
        return loss, accuracy, probs

    #return the val_acc, val_loss, be called at the end of each epoch
    def validate_one_epoch(self):
      val_loss, val_accuracy = self.evaluate(self.val_loader)
      return val_loss, val_accuracy

In [64]:
class LSTM1(nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout_1 = nn.Dropout(0.1)
        self.lstm = nn.LSTM(1, 25, num_layers=1, batch_first=True)
        # self.gru = nn.GRU(1, 25, num_layers=1, batch_first=True)
        # self.fc = nn.Sequential(nn.Linear(25, 2), nn.Softmax(dim=1))
        self.dropout_2 = nn.Dropout(0.1)
        self.fc = nn.Linear(25, 2)

    def forward(self, x):
        # Insert your code here
        # print(x)
        x = self.dropout_1(x)
        _, (h_n, _) = self.lstm(x)
        # _, h_n = self.gru(x)
        # print(h_n.shape)
        x = h_n.reshape(h_n.shape[1], -1)
        # print(x.shape)
        x = self.dropout_2(x)
        x = self.fc(x)

        return x

In [65]:
import numpy as np
import pandas as pd
import glob
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

def calculate_daily_returns(prices):
    returns = (prices[1:] - prices[:-1]) / prices[:-1]
    return returns

def create_sliding_windows_with_returns(price_data, window_size=240, start_idx=0, end_idx=None):
    x_data = []
    y_data = []
    y_prices = []
    
    if end_idx is None:
        end_idx = len(price_data) - window_size  

    for i in range(start_idx, end_idx):
        window = price_data[i:i + window_size + 1]  
        # print(window)
        # print(window[-1])
        # print(len(price_data))
        # print(i)
        # print(window_size)
        # print(price_data[i + window_size])
        y_prices.append((window[-1],price_data[i + window_size + 1]))
        returns = (window[1:] - window[:-1]) / window[:-1]
        x_data.append(returns.reshape(-1, 1))
        # print((price_data[i + window_size + 1],price_data[i + window_size]))
        y_value = (price_data[i + window_size + 1]-price_data[i + window_size])/price_data[i + window_size]
        # print(y_value)
        y_data.append(y_value)
    # print(y_prices)
    
    return np.array(x_data), np.array(y_data), np.array(y_prices)

def create_sliding_windows_with_returns_volume(price_data, volume_data, window_size=240, start_idx=0, end_idx=None):
    x_data = []
    y_data = []
    y_prices = []
    
    if end_idx is None:
        end_idx = len(price_data) - window_size  

    for i in range(start_idx, end_idx):
        window = price_data[i:i + window_size + 1]
        v_window = volume_data[i+1:i + window_size + 1]
        # print(windows[-1])
        # print(price_data[i + window_size])
        y_prices.append((window[-1],price_data[i + window_size + 1]))
        returns = (window[1:] - window[:-1]) / window[:-1]

        # print(len(returns))
        # print(len(v_window))
        v_window = np.append(returns.reshape(-1, 1), v_window.reshape(-1, 1), axis=1)
        x_data.append(v_window)
        # print(np.array(x_data).shape)
        y_value = (price_data[i + window_size + 1]-price_data[i + window_size])/price_data[i + window_size]
        y_data.append(y_value)
    
    return np.array(x_data), np.array(y_data), np.array(y_prices)

def process_multiple_stocks(file_path_pattern, window_size=240, ymd=20000101):
    all_files = glob.glob(file_path_pattern)  
    all_x_train = []
    all_y_train = []
    all_x_test = []
    all_y_test = []
    y_train_returns = []
    y_test_returns = []
    all_y_prices = []


    fbm=['0012', '7054', '5238', '7086', '2488', '7131', '0218', '7120', '5281', '7191', '9148', '7146', '0181', '6599', '5139', '5185', '5198', '7145', '7315', '7090', '4952', '0122', '5099', '5014', '2682', '2658', '7609', '5115', '5116', '2674', '1163', '5269', '1015', '5293', '0159', '5120', '1007', '7031', '6351', '7083', '4758', '0048', '6556', '9342', '5568', '5088', '5194', '5015', '6432', '0119', '7214', '7181', '7007', '5210', '5127', '1481', '0068', '7722', '7129', '4057', '0105', '7162', '6399', '0072', '8176', '7048', '5130', '7099', '8885', '7579', '6888', '5106', '2305', '5021', '7078', '0098', '7251', '4162', '5248', '6602', '0187', '6173', '5190', '9814', '8133', '7005', '5258', '0195', '6998', '0179', '5032', '3239', '3395', '5196', '4219', '6025', '1562', '1899', '5069', '0168', '9288', '7036', '6297', '5254', '5100', '5932', '9474', '8761', '9938', '7221', '2771', '0011', '7188', '1818', '7174', '0191', '7154', '7128', '5105', '5229', '0163', '5257', '2836', '7076', '2925', '7035', '2879', '5195', '8982', '8435', '8044', '8052', '7209', '7187', '5738', '5273', '1929', '5007', '5797', '7016', '2828', '1023', '7117', '7018', '5180', '2852', '7986', '5104', '5188', '5071', '7205', '7195', '2127', '5136', '5037', '0102', '8591', '6718', '7202', '5094', '0051', '5049', '9423', '7157', '5082', '5184', '7204', '8125', '5276', '8338', '0091', '5141', '7179', '3484', '7119', '5132', '7212', '0152', '0131', '7277', '6947', '0029', '7528', '5908', '3026', '4456', '7114', '5835', '5265', '7169', '7148', '0205', '7198', '1619', '5216', '7233', '3948', '7165', '5178', '3417', '0154', '2097', '5259', '2143', '3557', '0059', '5253', '8206', '5036', '7471', '1368', '0107', '0064', '0065', '8907', '5081', '5208', '7182', '8877', '9016', '5228', '0090', '7217', '9091', '6076', '7149', '5056', '8613', '7773', '0190', '0100', '6815', '7208', '7094', '5101', '7249', '5283', '3689', '2984', '7047', '5029', '6041', '7229', '2755', '5222', '6939', '0149', '8605', '3107', '0150', '9318', '5197', '0116', '0157', '5277', '9172', '7210', '0128', '9377', '7184', '9261', '5398', '5209', '5226', '5102', '5592', '0198', '0078', '0104', '4715', '2291', '3182', '7197', '5079', '0039', '0021', '3204', '7382', '5020', '5220', '9962', '0045', '1147', '0074', '7192', '2135', '7096', '0082', '5649', '0056', '0208', '0136', '7022', '3247', '7676', '1503', '7668', '7253', '3034', '2062', '5008', '7501', '5168', '5187', '7105', '3255', '5121', '4324', '5095', '5151', '3298', '0175', '0160', '5072', '5199', '7033', '8443', '5819', '5274', '1082', '3301', '0188', '5169', '5165', '5160', '7010', '5291', '6238', '5138', '0185', '5028', '2739', '5062', '7013', '5000', '5024', '8478', '9601', '4251', '5084', '5108', '5255', '9113', '9687', '0174', '0023', '5606', '5227', '5225', '3336', '2216', '5268', '5614', '7222', '7243', '0166', '2607', '0094', '5295', '0147', '6262', '3379', '0192', '1961', '5249', '5673', '5107', '7183', '8834', '0010', '0209', '9393', '5175', '1589', '7043', '7223', '0024', '4723', '8648', '7152', '0058', '5161', '8931', '9083', '0146', '0127', '8923', '6769', '3441', '7167', '4383', '0111', '0193', '8672', '0170', '5247', '0054', '7216', '7199', '7323', '6483', '7161', '9334', '0143', '6491', '0151', '0036', '6203', '7062', '0210', '6211', '5371', '5171', '5060', '5280', '9466', '2445', '2453', '5027', '7164', '5035', '6971', '7017', '7153', '0002', '5878', '7130', '7077', '5843', '9121', '1996', '0176', '3476', '5038', '5192', '5172', '4847', '0180', '6874', '2186', '6572', '8362', '3174', '0018', '1643', '7006', '9385', '9326', '8494', '5789', '5284', '9628', '8079', '5232', '8745', '7170', '6633', '3573', '7089', '4235', '8486', '6645', '0182', '7126', '8303', '8621', '9881', '7085', '5068', '5143', '0075', '9199', '5078', '0017', '1198', '0140', '7617', '7087', '3859', '8583', '5264', '4936', '6181', '1058', '3514', '5098', '7029', '0189', '5236', '6012', '1155', '5077', '5886', '5152', '5983', '1171', '7189', '7004', '3794', '0167', '5182', '4502', '5090', '0081', '5129', '3778', '1694', '5223', '8192', '5040', '7234', '0207', '6149', '3069', '3662', '7595', '0155', '5186', '5026', '5286', '0126', '5001', '0112', '7935', '5166', '7219', '5576', '3816', '9571', '8141', '6114', '8893', '0085', '0034', '2194', '0113', '0103', '6459', '0156', '6548', '5237', '3867', '5123', '0070', '1651', '5916', '5202', '5011', '0213', '5924', '0092', '0043', '3883', '5085', '9539', '5703', '3891', '3913', '3905', '5087', '0138', '5275', '0041', '0108', '0206', '5073', '9806', '4707', '0020', '0096', '7241', '7060', '7139', '7215', '0083', '0201', '0026', '5047', '2038', '5066', '5025', '4944', '5533', '0172', '0049', '7071', '7107', '5827', '7140', '3018', '9008', '0035', '0040', '4006', '0079', '5065', '5053', '0053', '0153', '5260', '6009', '7225', '7052', '3719', '8419', '5125', '5022', '9407', '1724', '5657', '0022', '6912', '0177', '5212', '5041', '1295', '6068', '5183', '6254', '5622', '5271', '5231', '5133', '9997', '7160', '7108', '7080', '5436', '8311', '5219', '5681', '6033', '3042', '3611', '7081', '4464', '7201', '7095', '0006', '1902', '7163', '0171', '7055', '5075', '9695', '7172', '4081', '8869', '1287', '6637', '5080', '7088', '8117', '4634', '4065', '7190', '8273', '5204', '9873', '7168', '0123', '8346', '8966', '5070', '9598', '0038', '0186', '0007', '6807', '7134', '7123', '7237', '0217', '0196', '7084', '7544', '7498', '5272', '7765', '9296', '5256', '0032', '7232', '0173', '0200', '9946', '0106', '0037', '9954', '0202', '1066', '5278', '9741', '5113', '5270', '7803', '2542', '5134', '8567', '0183', '9822', '5147', '0133', '7811', '5218', '4596', '5252', '5157', '2569', '5207', '5170', '0109', '0158', '7247', '0161', '9237', '0099', '4731', '7239', '7158', '7045', '0001', '0028', '2224', '0212', '7073', '4286', '5145', '5163', '0178', '7053', '9792', '5250', '5205', '5279', '7180', '0055', '5517', '4316', '7412', '6017', '5181', '7246', '4197', '5285', '5288', '8532', '9431', '7115', '7155', '7248', '0215', '9776', '0203', '4375', '7132', '0117', '0169', '7943', '5213', '5242', '0093', '5126', '0216', '7103', '8664', '0129', '5665', '6084', '5006', '0080', '6904', '7207', '1201', '5263', '5176', '3743', '5211', '0148', '7235', '7106', '6521', '5135', '7186', '9717', '7082', '1538', '5173', '0050', '7228', '4898', '5012', '4022', '7211', '5158', '6139', '2259', '8524', '5191', '2429', '5149', '5140', '4448', '0211', '7097', '4405', '0132', '2054', '0084', '7439', '5289', '7200', '5347', '7252', '8702', '0089', '0145', '9369', '7034', '9075', '7206', '5112', '7889', '7374', '7079', '5031', '7854', '5239', '4863', '0101', '8397', '7218', '7230', '7285', '5010', '7113', '7173', '7176', '5054', '0199', '0118', '5401', '9059', '5042', '5230', '5167', '4359', '5111', '7100', '0005', '5148', '7133', '2593', '7137', '7227', '4588', '7091', '5005', '5200', '5110', '7757', '2089', '5292', '7250', '0060', '5243', '4995', '7240', '0120', '0097', '0069', '7070', '6963', '0066', '5162', '7203', '5016', '5142', '7226', '3565', '9679', '0197', '7231', '0162', '7692', '0008', '0141', '6378', '7050', '7025', '5246', '5009', '4243', '7245', '5156', '7121', '0095', '5267', '0165', '7003', '0025', '5584', '0086', '5048', '7293', '7020', '7014', '3158', '5159', '7066', '7178', '4677', '6742', '5109', '7028', '2283', '5131']
    # fbm_has1994=['5185', '1619', '5398', '1082', '1066', '2445', '3255', '8664', '3689', '4863', '2488', '1015', '4162', '1562', '1651', '5258', '4197', '3867', '1961', '3816', '1295', '4065', '2089', '1155', '1023', '5347', '3336', '4707', '4731', '3794', '4715', '5126', '4677', '3069', '1171', '5053', '3182']
    count = 0
    latest_date = 0
    dates = []
    for file in all_files:
        # if(count>1): continue
        ticker = file.split('_')[-1][0:-4]
        if ticker not in fbm: continue
        # fbm_100.remove(ticker)



        df = pd.read_csv(file)
        # print(df['<date>'][0])
        if(df['<date>'][0]>ymd): continue
        count+=1
        df=df[df['<date>']>=ymd] # start from 1/1/2020
        date = df['<date>'].values
        close_prices = df['<close>'].values 
        vol = df['<vol>'].values
        # dates.append(date[0])
        # if(date[0]<19940104): fbm_has1994.append(ticker)
        # print(date[0])
        # continue
        # print(ticker)
        
        # x_train, y_train, _ = create_sliding_windows_with_returns_volume(close_prices, vol, window_size, start_idx=0, end_idx=510)
        # x_test, y_test, y_price = create_sliding_windows_with_returns_volume(close_prices, vol, window_size, start_idx=510, end_idx=760)

        x_train, y_train, _ = create_sliding_windows_with_returns(close_prices, window_size, start_idx=0, end_idx=510)
        x_test, y_test, y_price = create_sliding_windows_with_returns(close_prices, window_size, start_idx=510, end_idx=760)

        # print(y_train)
        # print(x_train.shape)

        all_x_train.append(x_train)
        all_y_train.append(y_train)
        all_x_test.append(x_test)
        all_y_test.append(y_test)
        all_y_prices.append(y_price)

        y_train_returns.append(y_train)
        y_test_returns.append(y_test)


        # if len(close_prices) >= 1001:  

        #     x_train, y_train = create_sliding_windows_with_returns(close_prices, window_size, start_idx=0, end_idx=510)
        #     x_test, y_test = create_sliding_windows_with_returns(close_prices, window_size, start_idx=510, end_idx=760)

  
        #     all_x_train.append(x_train)
        #     all_y_train.append(y_train)
        #     all_x_test.append(x_test)
        #     all_y_test.append(y_test)

 
        #     y_train_returns.append(y_train)
        #     y_test_returns.append(y_test)
        # else:
        #     print(f"file {file} has been skipped, because column 'close' has less data than 1001 days")
    # print(count)
    # print(fbm_100) # deal with this later, missing
    # print(sorted(dates))
    # print(fbm_has1994)

    # print(all_y_train)
    
    all_x_train = np.concatenate(all_x_train, axis=0)
    all_y_train = np.concatenate(all_y_train, axis=0)
    all_x_test = np.concatenate(all_x_test, axis=0)
    all_y_test = np.concatenate(all_y_test, axis=0)
    all_y_prices = np.concatenate(all_y_prices, axis=0)

    y_train_returns = np.stack(y_train_returns, axis=1)  
    y_test_returns = np.stack(y_test_returns, axis=1) 

    median_train_returns = np.median(y_train_returns, axis=1) 
    median_test_returns = np.median(y_test_returns, axis=1)  

    # print(y_train_returns.shape)
    # print(len(median_train_returns))
    y_train_labels = []
    y_test_labels = []
    
    for y_day in y_train_returns:
        num = len(y_day)
        y_args = np.argsort(y_day)
        y_args_sorted = np.array_split(y_args,2)
        y_lab = np.zeros_like(y_day)
        y_lab[y_args_sorted[0]] = 0
        y_lab[y_args_sorted[1]] = 1
        y_train_labels.append(y_lab)
        
    for y_day in y_test_returns:
        num = len(y_day)
        y_args = np.argsort(y_day)
        y_args_sorted = np.array_split(y_args,2)
        y_lab = np.zeros_like(y_day)
        y_lab[y_args_sorted[0]] = 0
        y_lab[y_args_sorted[1]] = 1
        y_test_labels.append(y_lab)

    
    y_train_labels = np.array(y_train_labels).flatten('F')
    y_test_labels = np.array(y_test_labels).flatten('F')

    # y_train_labels = np.array([1 if y > median_train_returns[i % len(median_train_returns)] else 0 for i, y in enumerate(all_y_train)])
    # y_test_labels = np.array([1 if y > median_test_returns[i % len(median_test_returns)] else 0 for i, y in enumerate(all_y_test)])
    # print(len(y_train_labels))
    
    return all_x_train, y_train_labels, all_x_test, y_test_labels, all_y_prices, count


def save_data_to_npy(x_data, y_data, x_filename='x_data.npy', y_filename='y_data.npy'):
    np.save(x_filename, x_data)
    np.save(y_filename, y_data)
    print(f"x_data and y_data saved as {x_filename} and {y_filename}")



In [66]:
def calc_returns(test_price_data, probs, k=10):
    # Assuming every row of data corresponds to a separate list of line items: t-1 closing price, t closing price
    
    # print(probs)
    days = 250
    stock_num = int(len(probs)/days)
    returns = 0.0
    for d in range(days):
        start_d = stock_num*d
        probs_d = probs[start_d:start_d+stock_num]
        prices_d = test_price_data[start_d:start_d+stock_num]
        # print(len(probs_d))
        probs_args = np.argsort(probs_d)
        low_10 = probs_args[0:k]
        high_10 = probs_args[-k:]
        # print(f'Lowest {k}: {probs_d[low_10]}')
        # print(f'Highest {k}: {probs_d[high_10]}')
        returns_d = 0.0

        for i in low_10:
            prev_close_price, next_close_price = prices_d[i]
            # print(prev_close_price)
            # next_close_price = test_price_data[(250*d)+i, 1]
            # print(next_close_price)
            returns_d += (prev_close_price - next_close_price) / prev_close_price # earnings per dollar
            # print(returns)
        for i in high_10:
            prev_close_price, next_close_price = prices_d[i]
            # prev_close_price = test_price_data[(250*d)+i, 0]
            # next_close_price = test_price_data[(250*d)+i, 1]
            returns_d += (next_close_price - prev_close_price) / prev_close_price # earnings per dollar
        returns += (returns_d/(k*2))
    return returns

In [67]:
file_path_pattern = r'./dataset/*.csv'  
patience = 10
k=10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
seq_length=240

all_tot_returns=0.0
yrs = range(1995,2020)

all_train_acc = []
all_val_acc = []
all_test_acc = []
all_ticker_count = []

for yr in yrs:
    print(f'Year: {yr}')
    x_train, y_train_labels, x_test, y_test_labels, y_test_prices, ticker_count = process_multiple_stocks(file_path_pattern, seq_length, yr*10000+101)  
    print(f'ticker count: {ticker_count}')
    print(f'x_train shape: {x_train.shape}')
    print(f'y_train_labels shape: {y_train_labels.shape}')
    print(f'x_test shape: {x_test.shape}')
    print(f'y_test_labels shape: {y_test_labels.shape}')
    print(f'y_test_prices shape: {y_test_prices.shape}')
    all_ticker_count.append(ticker_count)
    
    unique_train, counts_train = np.unique(y_train_labels, return_counts=True)
    label_counts_train = dict(zip(unique_train, counts_train))
    print(f"Label count: {label_counts_train}")

    x_train_data = torch.tensor(x_train, dtype=torch.float32)
    y_train_data = torch.tensor(y_train_labels, dtype=torch.long)
    x_test_data = torch.tensor(x_test, dtype=torch.float32)
    y_test_data = torch.tensor(y_test_labels, dtype=torch.long)
    
    # y_train_data = torch.randint(0,2,(len(y_train_data),)) # test random y label)
    
    all_train_dataset = TensorDataset(x_train_data, y_train_data)
    N = len(all_train_dataset)
    N_train = int(0.8*N)
    N_val = N - N_train
    
    train_dataset, valid_dataset = torch.utils.data.random_split(all_train_dataset, [N_train, N_val])
    test_dataset = TensorDataset(x_test_data, y_test_data)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
    # lstm = LSTM1().to(device)
    lstm = LSTM1().to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.RMSprop(lstm.parameters(), lr=0.001)
    
    # print(lstm)
    trainer = BaseTrainer(model=lstm, criterion=criterion, optimizer=optimizer, train_loader=train_loader, val_loader=valid_loader)
    max_train_acc, max_val_acc = trainer.fit_withpatience(num_epochs=1000, patience=patience)
    # trainer.fit(num_epochs=1000)

    all_train_acc.append(max_train_acc)
    all_val_acc.append(max_val_acc)
    
    test_loss, test_acc, probs = trainer.evaluate_withprobs(test_loader)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

    tot_returns=calc_returns(y_test_prices, probs, k)*100/250
    print(f'Total returns: {tot_returns}\n')
    all_tot_returns+=tot_returns

    all_test_acc.append(test_acc)
print(f'Average ticker count: {np.mean(all_ticker_count)}')
print(f'Average total returns: {all_tot_returns/len(yrs):.4f}')
print(f'Average max train accuracy: {np.mean(all_train_acc):.4f}')
print(f'Average max val accuracy: {np.mean(all_val_acc):.4f}')
print(f'Average test accuracy: {np.mean(all_test_acc):.4f}')

Using device: cuda
Year: 1995
ticker count: 227
x_train shape: (115770, 240, 1)
y_train_labels shape: (115770,)
x_test shape: (56750, 240, 1)
y_test_labels shape: (56750,)
y_test_prices shape: (56750, 2)
Label count: {0.0: 58140, 1.0: 57630}
Epoch 1/1000
total time for each epoch 10.175489902496338
2895/2895 - train_loss: 0.6933 - train_accuracy: 50.0615%                 - val_loss: 0.0019 - val_accuracy: 50.3153%
Epoch 2/1000
total time for each epoch 9.982332229614258
2895/2895 - train_loss: 0.6932 - train_accuracy: 50.1630%                 - val_loss: 0.0019 - val_accuracy: 50.4060%
Epoch 3/1000
total time for each epoch 11.283515930175781
2895/2895 - train_loss: 0.6932 - train_accuracy: 50.2559%                 - val_loss: 0.0019 - val_accuracy: 49.6631%
Epoch 4/1000
total time for each epoch 11.788401126861572
2895/2895 - train_loss: 0.6932 - train_accuracy: 50.2591%                 - val_loss: 0.0019 - val_accuracy: 49.8229%
Epoch 5/1000
total time for each epoch 12.3523952960968

In [2]:
!unzip history_bursa_all.zip -d dataset

Archive:  history_bursa_all.zip
 extracting: dataset/h_TMC1O_48631O.csv  
 extracting: dataset/h_GENETECCC_0104CC.csv  
 extracting: dataset/h_BATC31_416231.csv  
 extracting: dataset/h_TAWIN_7097.csv  
 extracting: dataset/h_99SMARTCC_5326CC.csv  
 extracting: dataset/h_TWLWD_7079WD.csv  
 extracting: dataset/h_BAUTOC40_524840.csv  
 extracting: dataset/h_MAXISC49_601249.csv  
 extracting: dataset/h_ECONBHD_5253.csv  
 extracting: dataset/h_MESB_7234.csv  
 extracting: dataset/h_AFFIN_5185.csv  
 extracting: dataset/h_CGBWB_8052WB.csv  
 extracting: dataset/h_ENRA_8613.csv  
 extracting: dataset/h_LBS_5789.csv  
 extracting: dataset/h_MLABWD_0085WD.csv  
 extracting: dataset/h_HSICYE_0651YE.csv  
 extracting: dataset/h_UEMSC1H_51481H.csv  
 extracting: dataset/h_UEMSC1J_51481J.csv  
 extracting: dataset/h_MCEHLDGWA_7004WA.csv  
 extracting: dataset/h_OVERSEAWB_0153WB.csv  
 extracting: dataset/h_PLABS_0171.csv  
 extracting: dataset/h_RL_0219.csv   
 extracting: dataset/h_KLCC_5235SS.