## Create Dataset

In [1]:
import torch
from torch.utils.data import Dataset

import numpy as np
import csv


class StockDataset(Dataset):
    ''' Dataset that loads a stock CSV file and serves normalized features with class labels.

    The first CSV row is treated as a header and skipped, and the first four
    columns of every remaining row are discarded. Of the columns that are left,
    the first 22 are used as features and the last one as the class label.

    Features are standardized per column using the mean and standard deviation
    of the rows kept for the requested mode (each split is normalized with its
    own statistics).

    Args:
        path: Location of the CSV file.
        mode: 'test' keeps every row, 'train' keeps the first 80% of the rows
            and 'dev' keeps the remaining rows.

    Raises:
        ValueError: If `mode` is not 'train', 'dev' or 'test'.
    '''
    def __init__(self,
                 path,
                 mode='train'
              ):
        # Fail early: an unknown mode used to leave self.data unset and only
        # surfaced later as an AttributeError.
        if mode not in ('train', 'dev', 'test'):
            raise ValueError(
                f"mode must be 'train', 'dev' or 'test', got {mode!r}")
        self.mode = mode

        # Read data into a numpy array (header row and first 4 columns dropped)
        with open(path, 'r', newline='') as fp:
            rows = list(csv.reader(fp))
        data = np.array(rows[1:])[:, 4:].astype(float)

        target = data[:, -1]
        data = data[:, list(range(22))]

        if mode == 'train' or mode == 'dev':
            # Split the rows into train (first 80%) and dev (the rest)
            gap = int(len(data) * 0.8)
            if mode == 'train':
                data, target = data[:gap], target[:gap]
            else:
                data, target = data[gap:], target[gap:]

        # Convert data into PyTorch tensors (float32 features, int64 labels)
        self.data = torch.tensor(data, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.long)

        # Normalize features. A constant column has std == 0 and a single-row
        # split has an undefined (NaN) std; dividing by either would fill the
        # column with NaN, so those columns are divided by 1 instead.
        mean = self.data.mean(dim=0, keepdim=True)
        std = self.data.std(dim=0, keepdim=True)
        std = torch.where(std > 0, std, torch.ones_like(std))
        self.data = (self.data - mean) / std

        self.dim = self.data.shape[1]

    def __getitem__(self, index):
        ''' Return the (features, label) pair stored at `index`. '''
        return self.data[index], self.target[index]

    def __len__(self):
        ''' Return the number of samples kept for this mode. '''
        return len(self.data)


## Create Model

In [2]:
import torch
import torch.nn as nn

class Classifier(nn.Module):
    """Fully connected 3-class classifier over 22 input features.

    Five hidden stages (512, 256, 128, 64, 32 units), each made of a linear
    layer, batch normalization and ReLU, followed by a linear output layer
    that produces 3 raw scores per sample (no softmax is applied).
    """

    def __init__(self):
        super().__init__()

        # Layer widths from the input through the last hidden stage.
        widths = (22, 512, 256, 128, 64, 32)

        # Registers layer1/layer1_bn ... layer5/layer5_bn, in that order, so
        # the parameter names stay compatible with saved checkpoints.
        for stage, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f'layer{stage}', nn.Linear(fan_in, fan_out))
            setattr(self, f'layer{stage}_bn', nn.BatchNorm1d(fan_out))

        self.out = nn.Linear(32, 3)

        self.act_fn = nn.ReLU()

    def forward(self, x):
        """Map a (batch, 22) float tensor to a (batch, 3) tensor of class scores."""
        for stage in range(1, 6):
            linear = getattr(self, f'layer{stage}')
            batch_norm = getattr(self, f'layer{stage}_bn')
            x = self.act_fn(batch_norm(linear(x)))

        return self.out(x)

In [3]:

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score 
# Turn matplotlib's interactive mode off; the plotting helpers in this notebook
# write their figures to disk with savefig and then close them.
plt.ioff()
# fix random seed
def same_seeds(seed):
    """Seed the torch (CPU and, if present, CUDA) and numpy RNGs with `seed`.

    Also switches cuDNN to deterministic mode with auto-tuning (benchmark)
    disabled.
    """
    # cuDNN flags are independent of the seeding calls below.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

# Seed every RNG once, before any model or DataLoader is created.
same_seeds(0)

# Computation device; hard-coded to the CPU (no CUDA auto-detection is done here).
device = 'cpu'
print(f'DEVICE: {device}')

# Training hyper-parameters shared by every model trained in this notebook.
num_epoch = 400        # number of passes over the training set
BATCH_SIZE = 32        # samples per mini-batch for every DataLoader
learning_rate = 0.001       # step size handed to the Adam optimizer


def macro_precision(y_true, y_predict):
    """Score predictions by directional hits minus directional reversals.

    Despite its name, this does not return a precision. It returns an integer:
    the number of samples correctly predicted as class 0 or class 2, minus the
    number of samples where class 0 was predicted as 2 or class 2 as 0.
    Correct or incorrect predictions involving class 1 do not affect the score.

    Args:
        y_true: Sequence of true class ids.
        y_predict: Sequence of predicted class ids, aligned with `y_true`.

    Returns:
        int: hits on classes 0 and 2 minus the 0<->2 confusions; can be negative.
    """
    directional_hits = 0
    reversals = 0
    for truth, prediction in zip(y_true, y_predict):
        if truth == prediction:
            # Only the two outer classes earn a point; class 1 is neutral.
            if prediction == 0 or prediction == 2:
                directional_hits += 1
        elif (truth == 0 and prediction == 2) or (truth == 2 and prediction == 0):
            reversals += 1

    return directional_hits - reversals

def plot_learning_curve(acc_record, title, y_label, i , d, limit_y = True):
    ''' Plot one metric per epoch for the train and dev splits and save it as a PNG.

    acc_record: dict with 'train' and 'dev' lists holding one value per epoch
    title:      text appended to "Learning curve of " in the figure title
    y_label:    y-axis label; also used as the prefix of the output file name
    i, d:       values embedded in the output file name
    limit_y:    when True, the y-axis is fixed to the range [0, 1]
    '''
    epochs = range(len(acc_record['train']))

    figure(figsize=(6, 4))
    for split, colour in (('train', 'tab:red'), ('dev', 'tab:cyan')):
        plt.plot(epochs, acc_record[split], c=colour, label=split)

    if limit_y:
        plt.ylim(0.0, 1.0)

    plt.xlabel('Training Epoch')
    plt.ylabel(y_label)
    plt.title('Learning curve of {}'.format(title))
    plt.legend()

    # Written to disk instead of shown; closed right away to release the figure.
    plt.savefig(f'./models/fig/{y_label}_{i}_{d}d.png')
    plt.close()

DEVICE: cpu


## Training

In [18]:
error_list = []


def _evaluate(model, loader):
    """Run `model` in evaluation mode over `loader` and collect its predictions.

    Returns a tuple (n_correct, labels, predictions); the last two are flat
    lists with one entry per evaluated sample.
    """
    n_correct = 0
    seen_labels = []
    seen_predictions = []
    # BatchNorm must use its running statistics while evaluating, and must not
    # update them from evaluation batches.
    model.eval()
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, pred = torch.max(outputs, 1)  # index of the class with the highest score
            n_correct += (pred.cpu() == labels.cpu()).sum().item()
            seen_predictions.extend(pred.cpu().numpy())
            seen_labels.extend(labels.cpu().numpy())
    return n_correct, seen_labels, seen_predictions


def _summarise(n_correct, labels, predictions):
    """Return (accuracy, macro F1, macro_precision score) for one pass over a loader."""
    # Divide by the samples actually evaluated: the loaders use drop_last=True,
    # so this can be fewer than len(dataset).
    accuracy = n_correct / max(len(labels), 1)
    macro_f1 = f1_score(labels, predictions, average='macro')
    trade_score = macro_precision(labels, predictions)
    return accuracy, macro_f1, trade_score


# NOTE: `redo_list` must already exist when this cell runs; it is built by the
# checkpoint-scan cell (or reloaded from errors.json) further down the notebook.
# To train the full grid instead, iterate over i in range(1, 201) and
# d in [10, 20, 60, 120].
for error in redo_list:
    i, d = error
    i = int(i)
    d = int(d[:-1])  # strip the trailing 'd', e.g. '20d' -> 20
    print(f'Model {i}_{d}d')

    try:
        # mode='test' makes StockDataset keep every row of each file; the
        # train/validation split comes from the two separate CSV files.
        train_set = StockDataset(f'./data/training_data/training_data_{i}_{d}d.csv', mode='test')
        val_set = StockDataset(f'./data/training_data/val_data_{i}_{d}d.csv', mode='test')
        # only shuffle the training data
        train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

        # the path where the checkpoint is saved
        model_path = f'./models/models/model_{i}_{d}d.ckpt'

        # create model, define a loss function, and optimizer
        model = Classifier().to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        # A later epoch only replaces the checkpoint when its dev score is
        # positive and beats every earlier epoch.
        best_macro_precision = 0.0

        acc_record = {'train': [], 'dev': []}
        macro_f1_record = {'train': [], 'dev': []}
        macro_precision_score_record = {'train': [], 'dev': []}

        # Epoch-0 baseline of the untrained model (dev first, then train), and
        # an initial checkpoint so that the file exists even if no epoch
        # ever improves on the starting threshold.
        for split, loader in (('dev', val_loader), ('train', train_loader)):
            accuracy, macro_f1, trade_score = _summarise(*_evaluate(model, loader))
            acc_record[split].append(accuracy)
            macro_f1_record[split].append(macro_f1)
            macro_precision_score_record[split].append(trade_score)
        torch.save(model.state_dict(), model_path)

        # start training
        for epoch in range(num_epoch):
            train_correct = 0
            train_labels = []
            train_predictions = []

            # training
            model.train()  # set the model to training mode
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                _, train_pred = torch.max(outputs, 1)  # index of the class with the highest score
                batch_loss.backward()  # compute gradient
                optimizer.step()  # update model with optimizer

                train_correct += (train_pred.cpu() == labels.cpu()).sum().item()
                train_predictions.extend(train_pred.cpu().numpy())
                train_labels.extend(labels.cpu().numpy())

            train_accuracy, train_f1, train_weighted_precision = _summarise(
                train_correct, train_labels, train_predictions)
            acc_record['train'].append(train_accuracy)
            macro_f1_record['train'].append(train_f1)
            macro_precision_score_record['train'].append(train_weighted_precision)

            # validation (_evaluate switches the model to evaluation mode)
            val_accuracy, val_f1, val_weighted_precision = _summarise(
                *_evaluate(model, val_loader))
            acc_record['dev'].append(val_accuracy)
            macro_f1_record['dev'].append(val_f1)
            macro_precision_score_record['dev'].append(val_weighted_precision)

            # if the model improves, save a checkpoint at this epoch
            if val_weighted_precision > best_macro_precision:
                best_macro_precision = val_weighted_precision
                torch.save(model.state_dict(), model_path)

        plot_learning_curve(acc_record, 'deep model', 'Accuracy', i, d)
        plot_learning_curve(macro_precision_score_record, 'deep model', 'Macro Precision', i, d, limit_y=False)
        plot_learning_curve(macro_f1_record, 'deep model', 'Macro F1 Score', i, d)
    except Exception as e:
        # Keep going with the remaining models; the failure is recorded so the
        # model can be retried later.
        error_list.append([i, d, str(e)])
            


Model 3_20d
Model 11_10d
Model 12_60d
Model 19_120d
Model 23_10d
Model 23_60d
Model 25_10d
Model 25_20d
Model 25_60d
Model 25_120d
Model 26_20d
Model 30_120d
Model 31_10d
Model 31_20d
Model 31_60d
Model 31_120d
Model 36_20d
Model 40_20d
Model 41_120d
Model 46_20d
Model 54_10d
Model 54_20d
Model 56_60d
Model 60_10d
Model 71_60d
Model 73_20d
Model 73_120d
Model 77_20d
Model 77_120d
Model 80_120d
Model 90_60d
Model 97_10d
Model 98_10d
Model 98_20d
Model 103_60d
Model 106_120d
Model 113_20d
Model 113_60d
Model 113_120d
Model 116_120d
Model 118_10d
Model 118_20d
Model 118_120d
Model 125_10d
Model 125_60d
Model 130_20d
Model 134_20d
Model 134_60d
Model 139_60d
Model 141_20d
Model 147_60d
Model 153_60d
Model 154_60d
Model 160_20d
Model 160_60d
Model 160_120d
Model 161_60d
Model 161_120d
Model 166_60d
Model 166_120d
Model 169_60d
Model 172_60d
Model 172_120d
Model 178_20d
Model 185_20d
Model 185_60d
Model 185_120d
Model 186_20d
Model 187_60d
Model 187_120d
Model 190_60d
Model 190_120d
Model 19

In [20]:
error_list

[]

In [6]:
import json

# Persist the failures recorded by the training cell so they survive a kernel restart.
with open("errors.json", "w") as outfile:
    json.dump(error_list, outfile)

# Reload them as the list of models to retrain. Each stored record is
# [i, d, message] with an integer d, while the training loop unpacks
# [index string, '<d>d'] pairs, so every record is converted to that shape.
with open('errors.json') as jsonfile:
    redo_list = [[str(i), f'{d}d'] for i, d, *_ in json.load(jsonfile)]

In [13]:
import pathlib

# [index, horizon] string pairs of every checkpoint already on disk,
# e.g. 'model_3_20d.ckpt' -> ['3', '20d'].
l = [
    [parts[1], parts[2]]
    for parts in (path.stem.split('_')
                  for path in pathlib.Path('./models/models').glob('*.ckpt'))
]

# Every (index, horizon) combination of the 200 x 4 grid without a checkpoint.
redo_list = [
    [str((i)), d]
    for i in range(1, 201)
    for d in ['10d', '20d', '60d', '120d']
    if [str((i)), d] not in l
]

[['3', '20d'],
 ['11', '10d'],
 ['12', '60d'],
 ['19', '120d'],
 ['23', '10d'],
 ['23', '60d'],
 ['25', '10d'],
 ['25', '20d'],
 ['25', '60d'],
 ['25', '120d'],
 ['26', '20d'],
 ['30', '120d'],
 ['31', '10d'],
 ['31', '20d'],
 ['31', '60d'],
 ['31', '120d'],
 ['36', '20d'],
 ['40', '20d'],
 ['41', '120d'],
 ['46', '20d'],
 ['54', '10d'],
 ['54', '20d'],
 ['56', '60d'],
 ['60', '10d'],
 ['71', '60d'],
 ['73', '20d'],
 ['73', '120d'],
 ['77', '20d'],
 ['77', '120d'],
 ['80', '120d'],
 ['90', '60d'],
 ['97', '10d'],
 ['98', '10d'],
 ['98', '20d'],
 ['103', '60d'],
 ['106', '120d'],
 ['113', '20d'],
 ['113', '60d'],
 ['113', '120d'],
 ['116', '120d'],
 ['118', '10d'],
 ['118', '20d'],
 ['118', '120d'],
 ['125', '10d'],
 ['125', '60d'],
 ['130', '20d'],
 ['134', '20d'],
 ['134', '60d'],
 ['139', '60d'],
 ['141', '20d'],
 ['147', '60d'],
 ['153', '60d'],
 ['154', '60d'],
 ['160', '20d'],
 ['160', '60d'],
 ['160', '120d'],
 ['161', '60d'],
 ['161', '120d'],
 ['166', '60d'],
 ['166', '120d'],


In [21]:
redo_list

[['3', '20d'],
 ['11', '10d'],
 ['12', '60d'],
 ['19', '120d'],
 ['23', '10d'],
 ['23', '60d'],
 ['25', '10d'],
 ['25', '20d'],
 ['25', '60d'],
 ['25', '120d'],
 ['26', '20d'],
 ['30', '120d'],
 ['31', '10d'],
 ['31', '20d'],
 ['31', '60d'],
 ['31', '120d'],
 ['36', '20d'],
 ['40', '20d'],
 ['41', '120d'],
 ['46', '20d'],
 ['54', '10d'],
 ['54', '20d'],
 ['56', '60d'],
 ['60', '10d'],
 ['71', '60d'],
 ['73', '20d'],
 ['73', '120d'],
 ['77', '20d'],
 ['77', '120d'],
 ['80', '120d'],
 ['90', '60d'],
 ['97', '10d'],
 ['98', '10d'],
 ['98', '20d'],
 ['103', '60d'],
 ['106', '120d'],
 ['113', '20d'],
 ['113', '60d'],
 ['113', '120d'],
 ['116', '120d'],
 ['118', '10d'],
 ['118', '20d'],
 ['118', '120d'],
 ['125', '10d'],
 ['125', '60d'],
 ['130', '20d'],
 ['134', '20d'],
 ['134', '60d'],
 ['139', '60d'],
 ['141', '20d'],
 ['147', '60d'],
 ['153', '60d'],
 ['154', '60d'],
 ['160', '20d'],
 ['160', '60d'],
 ['160', '120d'],
 ['161', '60d'],
 ['161', '120d'],
 ['166', '60d'],
 ['166', '120d'],


## Testing

In [8]:

def plot_confusion_matrix(cm,
                          target_names,
                          model,
                          d,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and save it as a PNG.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to leave the ticks
                  untouched

    model:        value embedded in the output file name

    d:            value embedded in the output file name

    title:        the text to display at the top of the matrix

    cmap:         the colour gradient used for the cells, see
                  http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues; defaults to 'Blues'

    normalize:    If False, annotate the cells with the raw numbers
                  If True, annotate the cells with per-row proportions

    The figure is written to ./models/performance/model_{model}_{d}d.png and
    closed afterwards; nothing is returned.

    Usage
    -----
    plot_confusion_matrix(cm, ['Down', 'Stable', 'Up'], 3, 20, normalize=False)

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # Currently unused by the plot (the axis label that showed it is disabled).
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    colour_map = plt.get_cmap('Blues') if cmap is None else cmap

    plt.figure(figsize=(8, 6))
    # The heat map is drawn from the matrix as passed in, before any normalization.
    plt.imshow(cm, interpolation='nearest', cmap=colour_map)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cell_format = "{:0.4f}"
        thresh = cm.max() / 1.5
    else:
        cell_format = "{:,}"
        thresh = cm.max() / 2

    # Annotate every cell; dark cells get white text so the number stays readable.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[row, col]
        plt.text(col, row, cell_format.format(value),
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(f'./models/performance/model_{model}_{d}d.png')
    plt.close()

In [26]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import csv
import json

# Class ids in the order of the tick labels passed to plot_confusion_matrix below.
CLASS_IDS = [0, 1, 2]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator rounded to 4 decimals, or 0 when the denominator is 0."""
    if denominator == 0:
        return 0
    return round(numerator / denominator, 4)


# Start a fresh results file containing only the header row.
with open('performance.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['model', 'd', 'acc', 'r1', 'r2', 'r3', 'p1', 'p2', 'p3', 'f1', 'f2', 'f3'])

with open('trend_params.json') as jsonfile:
    code_para_map = json.load(jsonfile)

with open('codes.txt', 'r') as f:
    codes = f.read().split()

# Per-horizon sum of the score-weighted gains accumulated over all models.
ROI = {10: 0, 20: 0, 60: 0, 120: 0}

for i in range(1, 201):
    if i % 10 == 0:
        print(f'model {i}')
    for d in [10, 20, 60, 120]:

        model_path = f'./models/models/model_{i}_{d}d.ckpt'
        # create testing dataset
        # NOTE(review): drop_last=True skips the final incomplete batch, so up
        # to BATCH_SIZE - 1 samples are not evaluated; kept to match the
        # validation loader used during training.
        test_set = StockDataset(f'./data/training_data/val_data_{i}_{d}d.csv', mode='test')
        test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

        # create model and load weights from checkpoint
        model = Classifier().to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))

        predict = []
        label = []
        model.eval()  # set the model to evaluation mode
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, test_pred = torch.max(outputs, 1)  # index of the class with the highest score
                predict.extend(test_pred.cpu().numpy())
                label.extend(labels.cpu().numpy())

        if not predict:
            # Fewer samples than one full batch: there is nothing to score.
            print(f'model {i}_{d}d: no complete batch to evaluate, skipped')
            continue

        # Count correct predictions per class and the 0<->2 confusions.
        hit_count = {0: 0, 1: 0, 2: 0}
        loss_count = 0
        for truth, prediction in zip(label, predict):
            if truth == prediction:
                hit_count[prediction] += 1
            elif (truth == 0 and prediction == 2) or (truth == 2 and prediction == 0):
                loss_count += 1

        acc = _safe_ratio(sum(hit_count.values()),
                          sum(label.count(c) for c in CLASS_IDS))

        # Per-class recall, precision and F1. A class that never occurs in the
        # labels (or is never predicted) gets 0 instead of a ZeroDivisionError.
        recalls, precisions, f_scores = [], [], []
        for c in CLASS_IDS:
            recall = _safe_ratio(hit_count[c], label.count(c))
            precision = _safe_ratio(hit_count[c], predict.count(c))
            if hit_count[c] == 0 or precision + recall == 0:
                f_score = 0
            else:
                # Computed from the already rounded precision and recall.
                f_score = round(2 * precision * recall / (precision + recall), 4)
            recalls.append(recall)
            precisions.append(precision)
            f_scores.append(f_score)

        with open('performance.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([i, d, acc, *recalls, *precisions, *f_scores])

        # labels=CLASS_IDS keeps the matrix 3x3 even when a class is missing
        # from both the labels and the predictions, matching the 3 tick names.
        plot_confusion_matrix(confusion_matrix(label, predict, labels=CLASS_IDS),
                              ['Down', 'Stable', 'Up'],
                              i,
                              d,
                              normalize=False
                              )
        gain = code_para_map[codes[i-1]][f'{d}d'][0] - 1
        ROI[d] += gain * (hit_count[0] + hit_count[2] - loss_count) / len(predict)

model 10
model 20
model 30
model 40
model 50
model 60
model 70
model 80
model 90
model 100
model 110
model 120
model 130
model 140
model 150
model 160
model 170
model 180
model 190
model 200


In [30]:
ROI

{10: 0.8666353317275792,
 20: 1.565148117865777,
 60: 4.1748055613749795,
 120: 8.56690323132078}

In [31]:
# Turn the per-horizon sums into averages over the 200 models evaluated above.
# NOTE(review): ROI is divided in place, so running this cell a second time
# divides the values again.
for d in ROI:
    ROI[d] /= 200
ROI

{10: 0.004333176658637896,
 20: 0.007825740589328884,
 60: 0.020874027806874897,
 120: 0.0428345161566039}

In [60]:
ROI

{15: 0.16074503155365555,
 30: 0.1415036395974974,
 90: 0.9053511935807522,
 180: 1.5318047851025869}

In [61]:
# NOTE(review): the evaluation cell in this notebook builds ROI with the keys
# 10/20/60/120, so the keys 15/30/90/180 used here presumably come from a
# different run; on a fresh kernel this loop raises KeyError. It also divides
# in place, so re-running it divides the values again.
for d in [15, 30, 90, 180]:
    ROI[d] /= 200

ROI

{15: 0.0008037251577682778,
 30: 0.0007075181979874869,
 90: 0.004526755967903761,
 180: 0.007659023925512935}