In [18]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings
import torch.optim as optim
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch.utils.data as data
from torchvision import datasets, transforms
warnings.filterwarnings('ignore')

In [19]:
# form the training and testing dataset
class DataLoader():
    IMAGE_WIDTH = 60
    IMAGE_HEIGHT = 64
    TRAIN_START = 2008
    TRAIN_END = 2010
    TEST_START = 2001
    TEST_END = 2019
    rt_day = 5 # number of days for forward looking
    train_data = []
    test_data = []
    positive_count = 0
    negative_count = 0

    def make_training_data(self):
        for year in tqdm(range(self.TRAIN_START, self.TRAIN_END)):
            
            img_path = os.path.join("./monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat")
            images = np.memmap(img_path, 
                               dtype=np.uint8, 
                               mode='r').reshape((-1, self.IMAGE_HEIGHT, self.IMAGE_WIDTH))
            # print(images.shape)
            label_path = os.path.join("./monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_labels_w_delay.feather")
            label_df = pd.read_feather(label_path)
            label_array = np.where(label_df[f"Ret_{self.rt_day}d"] > 0, 1, 0)
            # print(label_array.shape)
            for i in range(images.shape[0]):
                self.train_data.append([np.array(images[i]), np.eye(2)[label_array.item(i)]])
            print(f"{year} data finished loading.")

        np.random.shuffle(self.train_data)
        np.save("train_data.npy", self.train_data)

    def make_testing_data(self):
        for year in tqdm(range(self.TEST_START, self.TEST_START+1)):
            img_path = os.path.join("./monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat")
            images = np.memmap(img_path, 
                               dtype=np.uint8, 
                               mode='r').reshape((-1, self.IMAGE_HEIGHT, self.IMAGE_WIDTH))
            label_path = os.path.join("./monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_labels_w_delay.feather")
            label_df = pd.read_feather(label_path)
            label_array = np.where(label_df[f"Ret_{self.rt_day}d"] > 0, 1, 0)
            for i in range(images.shape[0]):
                self.test_data.append([np.array(images[i]), np.eye(2)[label_array.item(i)]])
            print(f"{year} data finished loading.")

        # np.random.shuffle(self.train_data)
        np.save("test_data.npy", self.test_data)

In [20]:
data = DataLoader()
data.make_training_data()

training_data = np.load("train_data.npy", allow_pickle=True)
print(len(training_data))
print(training_data[0][0], training_data[0][1])


 50%|█████     | 1/2 [00:01<00:01,  1.05s/it]

2008 data finished loading.


100%|██████████| 2/2 [00:01<00:00,  1.23it/s]

2009 data finished loading.





148251
[[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0 255   0]
 [  0   0   0 ...   0 255   0]
 [  0 255   0 ...   0 255   0]] [1. 0.]


In [21]:
# from matplotlib import pyplot as plt

# for i in range(5):
#     print(training_data[i][0], training_data[i][1])
#     plt.imshow(training_data[i][0], cmap='gray')
#     plt.show()



In [22]:
data = DataLoader()
data.make_testing_data()

testing_data = np.load("test_data.npy", allow_pickle=True)
print(len(testing_data))
print(testing_data[0][0], testing_data[0][1])

100%|██████████| 1/1 [00:01<00:00,  1.36s/it]

2001 data finished loading.





91985
[[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0 255   0 ...   0 255   0]] [1. 0.]


In [29]:
# construct CNN model layers
class ConvNet_20day(nn.Module):
    def __init__(self, num_classes=2):
        super(ConvNet_20day, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=(5, 3), stride=(3, 1), dilation=(2, 1), padding=(8, 1)),
            # nn.Conv2d(1, 64, kernel_size=(5, 3), padding=(2, 1)),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)))

        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=(5, 3), padding=(2, 1)),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)))
            
        self.layer3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=(5, 3), padding=(2, 1)),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)))
            
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                m.weight.data = nn.init.xavier_uniform(m.weight.data, 
                                                       gain=nn.init.calculate_gain('leaky_relu')) 
        
        self.fc = nn.Linear(46080, num_classes)
        # self.fc = nn.Linear(122880, num_classes)
        self.drop = nn.Dropout1d(p=0.5)

        

        
    def forward(self, x):
        # print(x.shape)
        out = self.layer1(x)
        # print(out.shape)
        out = self.layer2(out)
        # print(out.shape)
        out = self.layer3(out)
        print(out.shape, out.size(0))
        out = out.reshape(out.size(0), -1)
        print('dimention after reshape',out.shape)
        out = self.fc(out)
        out = self.drop(out)
        out = F.softmax(out, dim=1)

        return out

In [24]:
def validation(model, device, valid_X, valid_y, loss_function):

    model.eval()
    loss_total = 0

    with torch.no_grad():
        for i in tqdm(range(len(valid_X))):
            label = torch.argmax(valid_y[i])
            output = model(valid_X[i].view(-1,1,64,60))[0]
            loss = loss_function(output, label)
            loss_total += loss.item()

    return loss_total / len(valid_X)

In [25]:
def traindata(device, model, epochs, batch_size, optimizer, loss_function, train_X, train_y, valid_X, valid_y):
    # Early stopping parameters
    last_loss = 100
    patience = 0
    triggertimes = 0

    for epoch in range(epochs): 
        model.train()
        for times in range(0, len(train_X), batch_size):
            batch_X = train_X[times:times+batch_size].view(-1, 1, 64, 60)
            batch_y = train_y[times:times+batch_size]

            optimizer.zero_grad()
            output = model(batch_X)
            print(output[0], batch_y[0])
            loss = loss_function(output, batch_y)
            loss.backward()
            optimizer.step()
            if times % 1280 == 0 or times == len(train_X):
                print('[{}/{}, {}/{}] loss: {:.8}'.format(epoch, epochs, times, len(train_X), 
                       loss.item()))

        # Early stopping
        current_loss = validation(model, device, valid_X, valid_y, loss_function)
        print('The Current Loss:', current_loss)

        if current_loss > last_loss:
            trigger_times += 1
            print('Trigger Times:', trigger_times)
            if trigger_times >= patience:
                print('Early stopping!\nStart to test process.')
                return model

        else:
            print('trigger times: 0')
            trigger_times = 0
            return model

        last_loss = current_loss

    return model



In [26]:
def test(device, model, test_X, test_y):

    model.eval()
    total = 0
    correct = 0
    
    with torch.no_grad():
        for i in tqdm(range(len(test_X))):
            real_class = torch.argmax(test_y[i])
            output = model(test_X[i].view(-1,1,64,60))[0]
            predicted_class = torch.argmax(output)
            if predicted_class == real_class:
                correct += 1
            total += 1

    print('Accuracy:', correct / total)

In [27]:
def main():
    # GPU device
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('Device state:', device)

    epochs = 100
    batch_size = 128
    lr = 1e-5
    loss_function = nn.CrossEntropyLoss()
    model = ConvNet_20day().to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr)
    # optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0)


    valid_size = int(len(training_data) * 0.3)

    X = np.array([i[0] for i in training_data])
    X = torch.Tensor(X).view(-1, 64, 60)
    X = X / 255.0
    y = torch.Tensor([i[1] for i in training_data])
    train_X, train_y = X[:-valid_size], y[:-valid_size]
    valid_X, valid_y = X[-valid_size:], y[-valid_size:]

    test_X = np.array([i[0] for i in testing_data])
    test_X = torch.Tensor(test_X).view(-1, 64, 60)
    test_X = test_X / 255.0
    test_y = torch.Tensor([i[1] for i in testing_data])

    

    # check if the training set is balances
    positive = [i for i in train_y if i[0] == 0]
    print(f"{len(train_y)} training samples with {len(positive)} positive return and " \
          f"{len(train_y) - len(positive)} negative return. \nTraining set is well-balanced.")        

    # trainloader = data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    # testloader = data.DataLoader(testing_data, batch_size=batch_size, shuffle=False)
    # validloader = data.DataLoader(validset, batch_size=batch_size, shuffle=True)

    # Train
    model = traindata(device, model, epochs, batch_size, optimizer, loss_function, 
                      train_X, train_y, valid_X, valid_y)
    # Test
    test(device, model, valid_X, valid_y)


# if __name__ == '__main__':
#     main()

In [30]:
if __name__ == '__main__':
    main()

Device state: cpu
103776 training samples with 50627 positive return and 53149 negative return. 
Training set is well-balanced.
torch.Size([128, 256, 3, 60]) 128
dimention after reshape torch.Size([128, 46080])
tensor([0.1310, 0.8690], grad_fn=<SelectBackward0>) tensor([1., 0.])
[0/100, 0/103776] loss: 0.70078975
torch.Size([128, 256, 3, 60]) 128
dimention after reshape torch.Size([128, 46080])
tensor([0.5000, 0.5000], grad_fn=<SelectBackward0>) tensor([1., 0.])
torch.Size([128, 256, 3, 60]) 128
dimention after reshape torch.Size([128, 46080])
tensor([0.5000, 0.5000], grad_fn=<SelectBackward0>) tensor([1., 0.])
torch.Size([128, 256, 3, 60]) 128
dimention after reshape torch.Size([128, 46080])
tensor([0.5000, 0.5000], grad_fn=<SelectBackward0>) tensor([0., 1.])
torch.Size([128, 256, 3, 60]) 128
dimention after reshape torch.Size([128, 46080])
tensor([0.2709, 0.7291], grad_fn=<SelectBackward0>) tensor([0., 1.])
torch.Size([128, 256, 3, 60]) 128
dimention after reshape torch.Size([128, 46

KeyboardInterrupt: 