In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings
import torch.optim as optim
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch.utils.data as data
from torchvision import datasets, transforms
warnings.filterwarnings('ignore')

In [3]:
# form the training and testing dataset
class DataLoader():
    """Builds the labelled image datasets used for training and testing.

    For every year in the configured range the loader memory-maps a raw
    ``uint8`` image file, reshapes it into ``IMAGE_HEIGHT x IMAGE_WIDTH``
    frames, reads the matching feather file of labels, and pairs each image
    with a one-hot label: ``[0, 1]`` when ``Ret_{rt_day}d`` is greater than
    zero and ``[1, 0]`` otherwise.

    Samples are kept on the instance (``train_data`` / ``test_data``) as
    ``[image, one_hot_label]`` pairs and are also written to
    ``train_data.npy`` / ``test_data.npy`` in the working directory.
    """
    IMAGE_WIDTH = 60
    IMAGE_HEIGHT = 64
    # Year ranges are inclusive on both ends.
    TRAIN_START = 1993
    TRAIN_END = 2000
    TEST_START = 2001
    TEST_END = 2019
    rt_day = 5 # number of days for forward looking
    positive_count = 0
    negative_count = 0

    def __init__(self):
        # Per-instance containers: class-level lists would be shared by every
        # DataLoader and silently accumulate duplicates across notebook re-runs.
        self.train_data = []
        self.test_data = []

    def _load_year(self, year):
        """Return the ``[image, one_hot_label]`` pairs stored for one year."""
        img_path = os.path.join("./monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat")
        label_path = os.path.join("./monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_labels_w_delay.feather")

        images = np.memmap(img_path,
                           dtype=np.uint8,
                           mode='r').reshape((-1, self.IMAGE_HEIGHT, self.IMAGE_WIDTH))
        label_df = pd.read_feather(label_path)
        # 1 when the forward return is strictly positive, 0 otherwise
        # (a NaN return compares False and therefore maps to 0).
        label_array = np.where(label_df[f"Ret_{self.rt_day}d"] > 0, 1, 0)

        if len(label_array) != images.shape[0]:
            raise ValueError(
                f"{year}: {images.shape[0]} images but {len(label_array)} labels")

        # One-hot encode all labels at once instead of rebuilding np.eye(2)
        # for every sample.
        one_hot = np.eye(2)[label_array]
        # np.array(...) copies each frame out of the memmap so the samples do
        # not keep the mapped file alive.
        return [[np.array(images[i]), one_hot[i]] for i in range(images.shape[0])]

    def _load_years(self, first_year, last_year):
        """Load every year in ``[first_year, last_year]`` into one sample list."""
        samples = []
        for year in tqdm(range(first_year, last_year + 1)):
            samples.extend(self._load_year(year))
            print(f"{year} data finished loading.")
        return samples

    @staticmethod
    def _save_samples(path, samples):
        """Save the sample pairs as an explicit ``(N, 2)`` object array.

        Handing the ragged Python list straight to ``np.save`` relies on
        implicit ragged-array creation, which recent NumPy versions reject.
        """
        packed = np.empty((len(samples), 2), dtype=object)
        for row, (image, label) in enumerate(samples):
            packed[row, 0] = image
            packed[row, 1] = label
        np.save(path, packed)

    def make_training_data(self):
        """Load the training years, shuffle them and save ``train_data.npy``."""
        # Rebuild from scratch so calling this twice does not duplicate samples.
        self.train_data = self._load_years(self.TRAIN_START, self.TRAIN_END)
        np.random.shuffle(self.train_data)
        self._save_samples("train_data.npy", self.train_data)

    def make_testing_data(self):
        """Load the testing years (in file order) and save ``test_data.npy``."""
        self.test_data = self._load_years(self.TEST_START, self.TEST_END)
        self._save_samples("test_data.npy", self.test_data)

In [5]:
# Build the training set from the raw yearly files; make_training_data()
# also writes it to "train_data.npy".
# NOTE(review): the name `data` rebinds the `torch.utils.data as data` module
# alias from the import cell, which is therefore unavailable afterwards.
data = DataLoader()
data.make_training_data()

# Reload the file that was just written. allow_pickle=True is needed because
# the saved samples are stored as Python objects (image / label pairs); only
# load .npy files produced by this notebook this way.
training_data = np.load("train_data.npy", allow_pickle=True)
# print(len(training_data))
# print(training_data[0][0], training_data[0][1])


 12%|█▎        | 1/8 [00:00<00:04,  1.47it/s]

1993 data finished loading.


 25%|██▌       | 2/8 [00:01<00:04,  1.38it/s]

1994 data finished loading.


 38%|███▊      | 3/8 [00:02<00:04,  1.20it/s]

1995 data finished loading.


 50%|█████     | 4/8 [00:03<00:03,  1.19it/s]

1996 data finished loading.


 62%|██████▎   | 5/8 [00:04<00:02,  1.04it/s]

1997 data finished loading.


 75%|███████▌  | 6/8 [00:06<00:02,  1.18s/it]

1998 data finished loading.


 88%|████████▊ | 7/8 [00:07<00:01,  1.19s/it]

1999 data finished loading.


100%|██████████| 8/8 [00:07<00:00,  1.05it/s]

2000 data finished loading.





In [34]:
# from matplotlib import pyplot as plt

# for i in range(5):
#     print(training_data[i][0], training_data[i][1])
#     plt.imshow(training_data[i][0], cmap='gray')
#     plt.show()



In [7]:
# Build the testing set from the raw yearly files; make_testing_data()
# also writes it to "test_data.npy".
# NOTE(review): the name `data` rebinds the `torch.utils.data as data` module
# alias from the import cell, which is therefore unavailable afterwards.
data = DataLoader()
data.make_testing_data()

# Reload the file that was just written. allow_pickle=True is needed because
# the saved samples are stored as Python objects (image / label pairs); only
# load .npy files produced by this notebook this way.
testing_data = np.load("test_data.npy", allow_pickle=True)
# print(len(testing_data))
# print(testing_data[0][0], testing_data[0][1])

  5%|▌         | 1/19 [00:00<00:13,  1.34it/s]

2001 data finished loading.


 11%|█         | 2/19 [00:01<00:17,  1.04s/it]

2002 data finished loading.


 16%|█▌        | 3/19 [00:02<00:15,  1.04it/s]

2003 data finished loading.


 21%|██        | 4/19 [00:03<00:14,  1.04it/s]

2004 data finished loading.


 26%|██▋       | 5/19 [00:04<00:13,  1.05it/s]

2005 data finished loading.


 32%|███▏      | 6/19 [00:05<00:12,  1.04it/s]

2006 data finished loading.


 37%|███▋      | 7/19 [00:06<00:12,  1.01s/it]

2007 data finished loading.


 42%|████▏     | 8/19 [00:07<00:11,  1.01s/it]

2008 data finished loading.


 47%|████▋     | 9/19 [00:08<00:09,  1.04it/s]

2009 data finished loading.


 53%|█████▎    | 10/19 [00:09<00:08,  1.09it/s]

2010 data finished loading.


 58%|█████▊    | 11/19 [00:10<00:07,  1.12it/s]

2011 data finished loading.


 63%|██████▎   | 12/19 [00:11<00:05,  1.19it/s]

2012 data finished loading.


 68%|██████▊   | 13/19 [00:11<00:04,  1.23it/s]

2013 data finished loading.


 74%|███████▎  | 14/19 [00:13<00:05,  1.07s/it]

2014 data finished loading.


 79%|███████▉  | 15/19 [00:14<00:03,  1.02it/s]

2015 data finished loading.


 84%|████████▍ | 16/19 [00:15<00:02,  1.07it/s]

2016 data finished loading.


 89%|████████▉ | 17/19 [00:15<00:01,  1.13it/s]

2017 data finished loading.


 95%|█████████▍| 18/19 [00:16<00:00,  1.17it/s]

2018 data finished loading.


100%|██████████| 19/19 [00:17<00:00,  1.09it/s]

2019 data finished loading.





In [8]:
# construct CNN model layers
class ConvNet_20day(nn.Module):
    """Three-block CNN that classifies single-channel 64x60 images.

    Each block is Conv2d -> BatchNorm2d -> LeakyReLU -> MaxPool2d((2, 1)).
    For an input of shape ``(N, 1, 64, 60)`` the blocks yield
    ``(N, 256, 3, 60)``, i.e. 46080 features per sample, which are flattened,
    passed through dropout and mapped to ``num_classes`` scores by ``fc``.

    Args:
        num_classes: number of output classes (default 2).

    ``forward`` returns class probabilities (softmax over ``dim=1``).
    """

    def __init__(self, num_classes=2):
        super(ConvNet_20day, self).__init__()
        # Strided, dilated first convolution: height 64 -> 24, width stays 60;
        # the pooling layer then halves the height to 12.
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=(5, 3), stride=(3, 1), dilation=(2, 1), padding=(8, 1)),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)))

        # Shape-preserving convolution, then height 12 -> 6.
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=(5, 3), padding=(2, 1)),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)))

        # Shape-preserving convolution, then height 6 -> 3.
        self.layer3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=(5, 3), padding=(2, 1)),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)))

        # Xavier-initialise every convolution. The in-place `xavier_uniform_`
        # replaces the deprecated `xavier_uniform` alias.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight,
                                        gain=nn.init.calculate_gain('leaky_relu'))

        self.fc = nn.Linear(46080, num_classes)
        # Element-wise dropout on the flattened features. Dropout1d on a 2-D
        # (N, num_classes) tensor would interpret the batch axis as channels
        # and zero the logits of entire samples during training.
        self.drop = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return class probabilities of shape ``(N, num_classes)``.

        Args:
            x: image batch; must flatten to 46080 features after the three
               blocks, which holds for shape ``(N, 1, 64, 60)``.
        """
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        # Flatten everything except the batch dimension.
        out = out.reshape(out.size(0), -1)
        # Regularise the inputs of the fully connected layer, not its outputs.
        out = self.drop(out)
        out = self.fc(out)
        # NOTE(review): the output is already softmax-normalised; a loss such
        # as nn.CrossEntropyLoss applies log-softmax again on top of it.
        out = F.softmax(out, dim=1)

        return out

In [9]:
def validation(model, device, valid_X, valid_y, loss_function, batch_size=256):
    """Return the mean per-sample loss of ``model`` on a validation set.

    The model is switched to eval mode and run without gradient tracking.
    Forward passes are done in batches, but the loss is still evaluated one
    sample at a time exactly as ``loss_function(output_i, class_index_i)``,
    so the returned average does not depend on ``batch_size`` or on the
    reduction mode of ``loss_function``.

    Args:
        model: network called with inputs of shape ``(B, 1, 64, 60)``.
        device: device the batches are moved to before the forward pass.
        valid_X: tensor of images; each batch is viewed as ``(-1, 1, 64, 60)``.
        valid_y: tensor of per-sample label vectors; the arg-max of each row
            is used as the class index.
        loss_function: callable taking ``(output_i, class_index_i)``.
        batch_size: number of samples per forward pass.

    Returns:
        float: sum of the per-sample losses divided by ``len(valid_X)``.

    Raises:
        ValueError: if ``valid_X`` is empty.
    """
    n_samples = len(valid_X)
    if n_samples == 0:
        raise ValueError("validation set is empty")

    model.eval()
    loss_total = 0.0

    with torch.no_grad():
        for start in tqdm(range(0, n_samples, batch_size)):
            stop = start + batch_size
            # Move only the current batch to the model's device.
            inputs = valid_X[start:stop].view(-1, 1, 64, 60).to(device)
            labels = torch.argmax(valid_y[start:stop], dim=1).to(device)
            outputs = model(inputs)
            for sample_output, label in zip(outputs, labels):
                loss_total += loss_function(sample_output, label).item()

    return loss_total / n_samples

In [10]:
def traindata(device, model, epochs, batch_size, optimizer, loss_function, train_X, train_y, valid_X, valid_y,
              patience=1):
    """Train ``model`` with mini-batches and early stopping on validation loss.

    After every epoch the validation loss is computed with ``validation``.
    Whenever it is higher than the previous epoch's loss a counter is
    incremented, otherwise the counter is reset; training stops once the
    counter reaches ``patience``.

    Args:
        device: device each batch is moved to before the forward pass.
        model: network called with inputs of shape ``(B, 1, 64, 60)``.
        epochs: maximum number of passes over the training data.
        batch_size: number of samples per optimisation step.
        optimizer: optimizer bound to ``model``'s parameters.
        loss_function: callable taking ``(model_output, batch_targets)``.
        train_X, train_y: training images and their target rows; ``train_y``
            slices are passed to ``loss_function`` unchanged.
        valid_X, valid_y: validation images and targets, forwarded to
            ``validation``.
        patience: number of consecutive epochs with a rising validation loss
            that triggers early stopping. The default of 1 stops on the first
            increase.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    # Early stopping state. Infinity guarantees the first epoch never counts
    # as a deterioration.
    last_loss = float('inf')
    trigger_times = 0
    n_train = len(train_X)

    for epoch in range(epochs):
        model.train()
        for times in range(0, n_train, batch_size):
            # Move only the current batch to the model's device.
            batch_X = train_X[times:times+batch_size].view(-1, 1, 64, 60).to(device)
            batch_y = train_y[times:times+batch_size].to(device)

            optimizer.zero_grad()
            output = model(batch_X)
            loss = loss_function(output, batch_y)
            loss.backward()
            optimizer.step()
            # Log every 1280 samples and on the final batch of the epoch.
            if times % 1280 == 0 or times + batch_size >= n_train:
                print('[{}/{}, {}/{}] loss: {:.8}'.format(epoch, epochs, times, len(train_X),
                       loss.item()))

        # Early stopping
        current_loss = validation(model, device, valid_X, valid_y, loss_function)
        print('The Current Loss:', current_loss)

        if current_loss > last_loss:
            trigger_times += 1
            print('Trigger Times:', trigger_times)
            if trigger_times >= patience:
                print('Early stopping!\nStart to test process.')
                return model
        else:
            # Validation loss did not get worse: reset the counter and keep
            # training (returning here would end training after one epoch).
            print('trigger times: 0')
            trigger_times = 0

        last_loss = current_loss

    return model



In [11]:
def test(device, model, test_X, test_y):

    model.eval()
    total = 0
    correct = 0
    
    with torch.no_grad():
        for i in tqdm(range(len(test_X))):
            real_class = torch.argmax(test_y[i])
            output = model(test_X[i].view(-1,1,64,60))[0]
            predicted_class = torch.argmax(output)
            if predicted_class == real_class:
                correct += 1
            total += 1

    print('Accuracy:', correct / total)

In [12]:
def main():
    """Train ``ConvNet_20day`` on ``training_data`` and evaluate on ``testing_data``.

    Reads the module-level ``training_data`` and ``testing_data`` arrays
    (sequences of ``[image, one_hot_label]`` pairs), scales the images to
    ``[0, 1]``, holds out the last 30% of the training samples for
    validation, trains with early stopping and finally reports the accuracy
    on the test set.
    """
    # GPU device
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('Device state:', device)

    epochs = 100
    batch_size = 128
    lr = 1e-5
    # NOTE(review): ConvNet_20day.forward already applies softmax, while
    # CrossEntropyLoss expects unnormalised scores - confirm this is intended.
    loss_function = nn.CrossEntropyLoss()
    model = ConvNet_20day().to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr)
    # optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0)

    # Stack into NumPy arrays first: building a tensor from a Python list of
    # arrays is very slow.
    X = np.array([i[0] for i in training_data])
    X = torch.Tensor(X).view(-1, 64, 60)
    X = X / 255.0
    y = torch.Tensor(np.array([i[1] for i in training_data]))

    # Hold out the last 30% for validation. Splitting at an explicit index
    # keeps the training part intact even when valid_size is 0
    # (X[:-0] would be empty).
    valid_size = int(len(training_data) * 0.3)
    split = len(X) - valid_size
    train_X, train_y = X[:split], y[:split]
    valid_X, valid_y = X[split:], y[split:]

    test_X = np.array([i[0] for i in testing_data])
    test_X = torch.Tensor(test_X).view(-1, 64, 60)
    test_X = test_X / 255.0
    test_y = torch.Tensor(np.array([i[1] for i in testing_data]))

    # Report the class balance of the training set. A positive return is
    # encoded as [0, 1], so its first component is 0.
    n_positive = int((train_y[:, 0] == 0).sum().item())
    print(f"{len(train_y)} training samples with {n_positive} positive return and " \
          f"{len(train_y) - n_positive} negative return. \nTraining set is well-balanced.")

    # Train
    model = traindata(device, model, epochs, batch_size, optimizer, loss_function,
                      train_X, train_y, valid_X, valid_y)
    # Test on the held-out test years rather than on the validation split,
    # which already steered early stopping.
    test(device, model, test_X, test_y)


In [13]:
# if __name__ == '__main__':
#     main()