In [1]:
import argparse
import copy
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import yaml
from torch.utils.data import Dataset, DataLoader

In [2]:
from sklearn.linear_model import LinearRegression

class OLS():
    """Wrapper around scikit-learn's ``LinearRegression``.

    Provides the ``train`` / ``set_model`` / ``test`` interface that the
    notebook's ``ML_train`` and ``ML_validation`` helpers call.
    """

    def __init__(self) -> None:
        super(OLS, self).__init__()
        self.ols = LinearRegression()

    def train(self, x_train, y_train, filename='trained_models/ols_model_mse.sav'):
        """Fit the regression and pickle the fitted estimator.

        Args:
            x_train: Feature matrix forwarded to ``fit``.
            y_train: Targets forwarded to ``fit``.
            filename: Path of the pickle file to write. The parent directory
                is created when it does not exist yet.
        """
        self.ols.fit(x_train, y_train)
        directory = os.path.dirname(filename)
        if directory:
            # A fresh checkout has no output folder; create it instead of failing.
            os.makedirs(directory, exist_ok=True)
        # The context manager guarantees the handle is flushed and closed.
        with open(filename, 'wb') as model_file:
            pickle.dump(self.ols, model_file)

    def set_model(self, model):
        """Replace the wrapped estimator (e.g. with one loaded from disk)."""
        self.ols = model

    def test(self, x_test):
        """Return the wrapped estimator's predictions for ``x_test``."""
        return self.ols.predict(x_test)


In [3]:
from sklearn.linear_model import Lasso

class LASSO():
    """Wrapper around scikit-learn's ``Lasso`` regressor.

    Provides the ``train`` / ``set_model`` / ``test`` interface that the
    notebook's ``ML_train`` and ``ML_validation`` helpers call.
    """

    def __init__(self, alpha=1) -> None:
        """Create the estimator.

        Args:
            alpha: Value passed as ``Lasso(alpha=...)``. Defaults to 1, the
                value that was previously hard-coded.
        """
        super(LASSO, self).__init__()
        self.lasso = Lasso(alpha=alpha)

    def train(self, x_train, y_train, filename='trained_models/lasso_model_mse.sav'):
        """Fit the regression and pickle the fitted estimator.

        Args:
            x_train: Feature matrix forwarded to ``fit``.
            y_train: Targets forwarded to ``fit``.
            filename: Path of the pickle file to write. The parent directory
                is created when it does not exist yet.
        """
        self.lasso.fit(x_train, y_train)
        directory = os.path.dirname(filename)
        if directory:
            # A fresh checkout has no output folder; create it instead of failing.
            os.makedirs(directory, exist_ok=True)
        # The context manager guarantees the handle is flushed and closed.
        with open(filename, 'wb') as model_file:
            pickle.dump(self.lasso, model_file)

    def set_model(self, model):
        """Replace the wrapped estimator (e.g. with one loaded from disk)."""
        self.lasso = model

    def test(self, x_test):
        """Return the wrapped estimator's predictions for ``x_test``."""
        return self.lasso.predict(x_test)


In [4]:
from sklearn.ensemble import RandomForestRegressor

class RandomForest:
    """Wrapper around scikit-learn's ``RandomForestRegressor``.

    Provides the ``train`` / ``set_model`` / ``test`` interface that the
    notebook's ``ML_train`` and ``ML_validation`` helpers call. ``test`` reads
    ``random_forest_fitted``, which only exists after ``train`` or
    ``set_model`` has been called.
    """

    def __init__(self, n_estimators=50, random_state=None):
        """Create the (unfitted) forest.

        Args:
            n_estimators: Number of trees. Defaults to 50, the value that was
                previously hard-coded (an earlier author note mentioned 200).
            random_state: Forwarded to ``RandomForestRegressor``; pass an int
                for reproducible fits. ``None`` keeps the previous behaviour.
        """
        # Author note carried over: tree depth is a hyperparameter still to consider.
        self.n_estimators = n_estimators
        self.random_forest = RandomForestRegressor(
            n_estimators=self.n_estimators,
            random_state=random_state,
        )

    def train(self, x_train, y_train, filename='trained_models/random_forest_model_mse.sav'):
        """Fit the forest and pickle the fitted estimator.

        Args:
            x_train: Feature matrix forwarded to ``fit``.
            y_train: Targets forwarded to ``fit``.
            filename: Path of the pickle file to write. The parent directory
                is created when it does not exist yet.
        """
        self.random_forest_fitted = self.random_forest.fit(x_train, y_train)
        directory = os.path.dirname(filename)
        if directory:
            # A fresh checkout has no output folder; create it instead of failing.
            os.makedirs(directory, exist_ok=True)
        # The context manager guarantees the handle is flushed and closed.
        with open(filename, 'wb') as model_file:
            pickle.dump(self.random_forest_fitted, model_file)

    def set_model(self, model):
        """Install an already-fitted estimator (e.g. one loaded from disk)."""
        self.random_forest_fitted = model

    def test(self, x_test):
        """Return the fitted estimator's predictions for ``x_test``."""
        y_predictions = self.random_forest_fitted.predict(x_test)
        return y_predictions
        

In [5]:
import torch
import torch.nn as nn

class DNN(nn.Module):
    """Fully connected regressor: 484 -> 100 -> 50 -> 33 -> 1.

    Each of the three hidden linear layers is followed by 1-D batch
    normalisation and a shared ELU activation; the final linear layer emits a
    single value per sample without an activation.

    Author notes on the intended setup (not enforced by this class):
    the 484 inputs are four concatenated quarters, Adam optimizer,
    learning rate 0.00005, batch size 256, 10 epochs.
    """

    def __init__(self):
        super().__init__()

        # Hidden stack: linear projection + batch norm, in forward order.
        self.LinearLayer1 = nn.Linear(484, 100)
        self.BN1 = nn.BatchNorm1d(100)
        self.LinearLayer2 = nn.Linear(100, 50)
        self.BN2 = nn.BatchNorm1d(50)
        self.LinearLayer3 = nn.Linear(50, 33)
        self.BN3 = nn.BatchNorm1d(33)
        # Output head: one value per sample.
        self.LinearLayer4 = nn.Linear(33, 1)
        # One stateless ELU instance is reused after every hidden block.
        self.ELUActivation = nn.ELU()

    def forward(self, x):
        """Run ``x`` through the three hidden blocks and the output layer."""
        hidden_blocks = (
            (self.LinearLayer1, self.BN1),
            (self.LinearLayer2, self.BN2),
            (self.LinearLayer3, self.BN3),
        )
        activations = x
        for linear, batch_norm in hidden_blocks:
            activations = self.ELUActivation(batch_norm(linear(activations)))
        return self.LinearLayer4(activations)

In [6]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Stacked GRU followed by a linear head that emits one value per sequence.

    Configuration fixed in ``__init__``: 121 input features per time step,
    4 stacked GRU layers, hidden size 20, ``batch_first=True``.

    Author notes on the intended setup (not enforced by this class):
    RMSProp optimizer, learning rate 0.001, 5 epochs, batch size 128. An
    earlier note questioned stacking 10 GRU cells; the code uses 4.
    """

    def __init__(self, device):
        """Build the GRU stack and the output layer.

        Args:
            device: Stored on ``self.device`` for callers. ``forward`` no
                longer needs it because the initial hidden state is created
                on the input's device.
        """
        super(RNN, self).__init__()

        self.device = device
        self.input_size = 121
        self.num_layers = 4
        self.hidden_size = 20
        self.num_classes = 1
        # batch_first=True: inputs are (batch_size, sequence_length, input_size).
        self.rnn = nn.GRU(self.input_size, self.hidden_size, self.num_layers, batch_first = True)
        self.linear_layer = nn.Linear(self.hidden_size, self.num_classes)

    def forward(self, x):
        """Return a ``(batch_size, 1)`` tensor for ``x``.

        Args:
            x: Tensor of shape ``(batch_size, sequence_length, 121)``.
        """
        # Zero initial hidden state, created on the same device/dtype as the
        # input so it can never disagree with the data fed to the GRU.
        initial_hidden_state = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size,
            device=x.device, dtype=x.dtype,
        )

        # Feed the input tensor (previously the module itself was passed here).
        out, _ = self.rnn(x, initial_hidden_state)
        # out: (batch_size, sequence_length, hidden_size)

        # Keep only the last time step of the top GRU layer: (batch_size, hidden_size).
        out = out[:, -1, :]
        out = self.linear_layer(out)

        return out



In [9]:
def calculate_error_per_epsilon(losses, BHAR):
    """Print bucket sizes and mean loss, grouped by the magnitude of BHAR.

    ``losses`` (column ``Loss``) and ``BHAR`` (column ``BHAR``) are outer-joined
    on their indexes. Rows are bucketed by ``|BHAR|`` into the half-open ranges
    (0, 0.05], (0.05, 0.10], (0.10, 0.20], (0.20, 0.50], (0.50, 1] and (1, inf).
    Two reports are printed: the shape of each bucket, then the mean ``Loss``
    of each bucket. Nothing is returned.
    """
    merged = pd.merge(losses, BHAR, how="outer", left_index=True, right_index = True)
    magnitude = merged['BHAR'].abs()

    # NOTE(review): the first range is open at 0, so rows whose BHAR is exactly
    # 0 land in no bucket - confirm that this is intended.
    edges = (0, 0.05, 0.10, 0.20, 0.50, 1)
    masks = [
        (magnitude > lower) & (magnitude <= upper)
        for lower, upper in zip(edges[:-1], edges[1:])
    ]
    masks.append(magnitude > 1)

    buckets = [merged.loc[mask] for mask in masks]
    report = '0 Error: {}\n 0.05 Error: {}\n 0.10 Error: {}\n 0.20 Error: {}\n 0.50 Error: {}\n 1.00 Error: {}\n'

    print(report.format(*(bucket.shape for bucket in buckets)))

    mean_losses = [bucket["Loss"].mean() for bucket in buckets]

    print(report.format(*mean_losses))

def percentage_correct(outs, BHAR):
    """Flag rows where the prediction and BHAR have the same strict sign.

    ``outs`` (column ``Out``) and ``BHAR`` (column ``BHAR``) are outer-joined on
    their indexes. A row is True when both values are > 0 or both are < 0.
    The normalised True/False counts are printed, and the flags are returned as
    a one-column DataFrame named ``Percentage Correct``.
    """
    merged = pd.merge(outs, BHAR, how="outer", left_index=True, right_index = True)
    actual = merged['BHAR']
    predicted = merged['Out']

    both_positive = (actual > 0) & (predicted > 0)
    both_negative = (actual < 0) & (predicted < 0)

    filt = pd.DataFrame(both_positive | both_negative, columns=['Percentage Correct'])
    print(f'Percentage_correct: {filt["Percentage Correct"].value_counts(normalize=True)}')
    return filt

In [10]:
# Command-line definition: one optional --config flag whose default is
# './configs/config.yaml'. Nothing in this cell parses arguments yet.
parser = argparse.ArgumentParser(
    description='Machine Learning-Based Financial Statement Analysis',
)
parser.add_argument(
    '--config',
    default='./configs/config.yaml',
)

class QuarterlyFundamentalData(Dataset):
    """Dataset backed by a comma-separated numeric file.

    The file is read in full with ``np.loadtxt``. Columns 0..483 become the
    feature tensor ``x``; column 484 becomes the target tensor ``y`` with
    shape ``(n_samples, 1)``.
    """

    def __init__(self, filename):
        table = np.loadtxt(filename, delimiter=",")
        features = table[:, :484]   # everything before the target column
        target = table[:, [484]]    # list index keeps the column axis: (n_samples, 1)
        self.x = torch.from_numpy(features)
        self.y = torch.from_numpy(target)
        self.num_samples = table.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample = self.x[index]
        label = self.y[index]
        return sample, label

    def __len__(self):
        """Return the number of rows read from the file."""
        return self.num_samples

class AverageMeter(object):
    """Keeps the latest value together with a weighted sum, count and mean.

    Attributes written by ``update``: ``val`` (last value seen), ``sum``
    (sum of ``val * n``), ``count`` (sum of ``n``) and ``avg``
    (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to 0."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target):
    """Return the count of arg-max matches divided by ``target.shape[0]``.

    The predicted index is the arg-max of ``output`` along its last dimension.
    It is compared element-wise with ``target`` and the number of matches is
    divided by the size of ``target``'s first dimension. The result is a
    floating-point tensor.

    NOTE(review): ``eq`` broadcasts, so a ``target`` whose shape differs from
    the arg-max result (e.g. ``(n, 1)`` regression targets) is compared
    pairwise rather than row by row.
    """
    num_rows = target.shape[0]
    predicted = output.max(dim=-1).indices
    matches = predicted.eq(target).sum() * 1.0
    return matches / num_rows

def ML_train(epoch, data_loader, model):
    """Fit a wrapper model (OLS / LASSO / RandomForest style) on a whole loader.

    Every batch yielded by ``data_loader`` is converted to a float32 NumPy
    array, the batches are concatenated along the first axis, and
    ``model.train`` is called exactly once with the full arrays.

    Calling ``model.train`` per batch, as was done before, refits the wrapped
    estimator from scratch each time, so the saved model only ever saw the last
    batch.

    Args:
        epoch: Unused; kept so the signature matches the other training helpers.
        data_loader: Iterable of ``(data, target)`` tensor pairs.
        model: Object exposing ``train(x, y)``.

    Returns:
        None. When the loader yields no batches, ``model.train`` is not called.
    """
    feature_batches = []
    target_batches = []

    for data, target in data_loader:
        # .cpu() is a no-op for CPU tensors and makes CUDA tensors NumPy-compatible.
        feature_batches.append(data.cpu().float().numpy())
        target_batches.append(target.cpu().float().numpy())

    if not feature_batches:
        return

    model.train(
        np.concatenate(feature_batches, axis=0),
        np.concatenate(target_batches, axis=0),
    )


def ML_validation(epoch, val_loader, model, criterion, percentage_correct_criterion):
    """Evaluate a wrapper model (one exposing ``test(x)``) on a validation loader.

    Args:
        epoch: Epoch number, used only in the progress line.
        val_loader: Iterable of ``(data, target)`` tensor pairs; targets are
            expected to be 2-D because per-sample losses are squeezed on dim 1.
        model: Object exposing ``test(x)`` that returns predictions for a
            NumPy feature array.
        criterion: Reduced loss used for the running average.
        percentage_correct_criterion: Unreduced loss; its per-sample values are
            collected for every batch.

    Returns:
        ``(all_losses, all_outs, average_loss)``: per-batch lists of per-sample
        losses, per-batch prediction tensors, and the sample-weighted average of
        ``criterion`` as a Python float.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    iter_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()
    all_losses = []
    all_outs = []
    abs_errors = []

    for idx, (data, target) in enumerate(val_loader):
        start = time.time()
        data = data.cpu().float().numpy()
        target = target.cpu().float()

        # Estimators may return 1-D predictions; give them the target's shape
        # and dtype so the losses below compare row by row instead of
        # silently broadcasting to an (n, n) matrix.
        out = torch.as_tensor(model.test(data), dtype=target.dtype).reshape(target.shape)
        loss = criterion(out, target)
        abs_errors.append((target - out).abs().flatten())

        all_losses.append(percentage_correct_criterion(out, target).squeeze(1).tolist())
        all_outs.append(out)
        # Only feeds the 'Prec @1' column of the progress line.
        batch_acc = accuracy(out, target)

        losses.update(loss, out.shape[0])
        acc.update(batch_acc, out.shape[0])

        iter_time.update(time.time() - start)

        if idx % 10 == 0:
            print(('Epoch: [{0}][{1}/{2}]\t'
                   'Time {iter_time.val:.3f} ({iter_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec @1 {top1.val:.4f} ({top1.avg:.4f})\t')
                  .format(epoch, idx, len(val_loader), iter_time=iter_time, loss=losses, top1=acc))

    if not all_outs:
        raise ValueError('val_loader produced no batches; nothing to validate')

    # Report metrics over the whole validation set rather than the last batch:
    # the square root of the sample-weighted mean criterion and the median
    # absolute error across every sample.
    rmse = torch.sqrt(losses.avg)
    medae = torch.median(torch.cat(abs_errors))
    print(f'RSME: ', rmse)
    print(f'MEDAE: ', medae)

    return all_losses, all_outs, losses.avg.tolist()
    

def train(epoch, data_loader, model, optimizer, criterion):
    """Run one optimisation pass of a torch model over ``data_loader``.

    Args:
        epoch: Epoch number, used only in the progress line.
        data_loader: Iterable of ``(data, target)`` tensor pairs.
        model: ``nn.Module`` to optimise; it is switched to training mode.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Reduced loss function.

    Returns:
        The sample-weighted average loss over the pass, as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    iter_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # Send batches to wherever the model lives, not to CUDA merely because it
    # is available; this also works for a CPU model on a GPU machine.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Layers such as BatchNorm behave differently in train and eval mode.
    model.train()

    for idx, (data, target) in enumerate(data_loader):
        start = time.time()
        data = data.to(device).float()
        target = target.to(device).float()
        # Call the module (not .forward) so registered hooks still run.
        out = model(data)
        loss = criterion(out, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Only feeds the 'Prec @1' column of the progress line.
        batch_acc = accuracy(out.detach(), target)

        # Detach so the meter does not keep every batch's autograd graph alive.
        losses.update(loss.detach(), out.shape[0])
        acc.update(batch_acc, out.shape[0])

        iter_time.update(time.time() - start)

        if idx % 10 == 0:
            print(('Epoch: [{0}][{1}/{2}]\t'
                   'Time {iter_time.val:.3f} ({iter_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec @1 {top1.val:.4f} ({top1.avg:.4f})\t')
                  .format(epoch, idx, len(data_loader), iter_time=iter_time, loss=losses, top1=acc))

    if losses.count == 0:
        raise ValueError('data_loader produced no batches; nothing to train on')

    return losses.avg.tolist()

def validate(epoch, val_loader, model, criterion, percentage_correct_criterion):
    """Evaluate a torch model on a validation loader without updating it.

    The model is put in eval mode for the duration of the call (so layers such
    as BatchNorm use their running statistics and do not update them) and its
    previous mode is restored afterwards.

    Args:
        epoch: Epoch number, used only in the progress line.
        val_loader: Iterable of ``(data, target)`` tensor pairs; outputs are
            expected to be 2-D because per-sample losses are squeezed on dim 1.
        model: ``nn.Module`` to evaluate.
        criterion: Reduced loss used for the running average.
        percentage_correct_criterion: Unreduced loss; its per-sample values are
            collected for every batch.

    Returns:
        ``(all_losses, all_outs, average_loss)``: per-batch lists of per-sample
        losses, per-batch output tensors, and the sample-weighted average of
        ``criterion`` as a Python float.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    iter_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()
    all_losses = []
    all_outs = []
    abs_errors = []

    # Send batches to wherever the model lives, not to CUDA merely because it
    # is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()
    try:
        for idx, (data, target) in enumerate(val_loader):
            start = time.time()
            data = data.to(device).float()
            target = target.to(device).float()

            with torch.no_grad():
                # Call the module (not .forward) so registered hooks still run.
                out = model(data)
                loss = criterion(out, target)
                per_sample = percentage_correct_criterion(out, target)
                abs_errors.append((target - out).abs().flatten())
            all_losses.append(per_sample.squeeze(1).tolist())
            all_outs.append(out)
            # Only feeds the 'Prec @1' column of the progress line.
            batch_acc = accuracy(out, target)

            losses.update(loss, out.shape[0])
            acc.update(batch_acc, out.shape[0])

            iter_time.update(time.time() - start)
            if idx % 10 == 0:
                print(('Epoch: [{0}][{1}/{2}]\t'
                       'Time {iter_time.val:.3f} ({iter_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                       'Prec @1 {top1.val:.4f} ({top1.avg:.4f})\t')
                      .format(epoch, idx, len(val_loader), iter_time=iter_time, loss=losses, top1=acc))
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    if not all_outs:
        raise ValueError('val_loader produced no batches; nothing to validate')

    # Report metrics over the whole validation set rather than the last batch.
    rmse = torch.sqrt(losses.avg)
    medae = torch.median(torch.cat(abs_errors))
    print(f'RSME: ', rmse)
    print(f'MEDAE: ', medae)

    return all_losses, all_outs, losses.avg.tolist()

In [None]:
# --- Run configuration -------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_epochs = 1
batch_size = 4096
learning_rate = 0.00005
train_set_filename = 'data/batched_train_data.csv'
val_set_filename = 'data/batched_val_data.csv'
model = OLS()#.to(device)
#model = DNN().to(device)

# --- Data --------------------------------------------------------------------
train_dataset = QuarterlyFundamentalData(train_set_filename)
data_loader = DataLoader(dataset=train_dataset, batch_size= batch_size, shuffle=False, num_workers=2) # num_workers uses multiple subprocesses

val_dataset = QuarterlyFundamentalData(val_set_filename)
val_loader = DataLoader(dataset=val_dataset, batch_size= batch_size, shuffle=False, num_workers=2) # num_workers uses multiple subprocesses

# The dataset has already parsed the validation CSV; reuse its target column
# (column 484, shape (n_samples, 1)) instead of reading the file a second time.
val_target = val_dataset.y

# --- Losses ------------------------------------------------------------------
criterion = nn.MSELoss()
percentage_correct_criterion = nn.MSELoss(reduction='none')
#optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_losses = []
val_losses = []
val_all_losses = []
val_percentage_correct = []
val_all_outs = []

# Optional: reuse a previously saved estimator instead of refitting.
# NOTE(review): pickle.load executes arbitrary code; only load files you trust.
#filename= 'trained_models/ols_model_mse.sav'
#loaded_model = pickle.load(open(filename, 'rb'))
#model.set_model(loaded_model)

# --- Train / validate --------------------------------------------------------
for epoch in range(num_epochs):
    # scikit-learn style wrappers (OLS / LASSO / RandomForest)
    ML_train(epoch, data_loader, model)
    val_all_losses, val_all_outs, tmp = ML_validation(epoch, val_loader, model, criterion, percentage_correct_criterion)
    val_losses.append(tmp)

    # torch models (DNN / RNN): use these instead of the two calls above
    #train_losses.append(train(epoch, data_loader, model, optimizer, criterion))
    #val_all_losses, val_all_outs, tmp = validate(epoch, val_loader, model, criterion, percentage_correct_criterion)
    #val_losses.append(tmp)

#print(f'Train: {train_losses}')

print(f'Val: {val_losses}')

# --- Per-sample diagnostics --------------------------------------------------
BHAR = pd.DataFrame(val_target.tolist(), columns=["BHAR"])
losses = [item for sublist in val_all_losses for item in sublist]
val_all_losses = pd.DataFrame(losses, columns=["Loss"])

val_all_outs = [item for sublist in val_all_outs for item in sublist]
val_all_outs = pd.DataFrame(val_all_outs, columns=["Out"])

calculate_error_per_epsilon(val_all_losses, BHAR)
percentage_correct(val_all_outs, BHAR)

# Optional (RandomForest only): plot the ten most important features.
# Previously kept as an inert triple-quoted string; uncomment to use.
#data_columns = pd.read_csv('data/train_data.csv', nrows=1).columns
#data_columns = list(data_columns.drop(['tic', 'datadate', 'PRC', 'BHAR']))
#print(f'All Columns: {type(data_columns)}, RF Features: {type(model.random_forest_fitted.feature_importances_.shape)}')
#all_columns = data_columns + data_columns + data_columns + data_columns
#sorted_idx = model.random_forest_fitted.feature_importances_.argsort()[-10:]
#
#print(f'Sorted Index: {sorted_idx}, All Columns: {np.array(all_columns)[sorted_idx.astype(int)]}, RF Features: {np.array(model.random_forest_fitted.feature_importances_)[sorted_idx.astype(int)]}')
#plt.barh(np.array(all_columns)[sorted_idx.astype(int)], np.array(model.random_forest_fitted.feature_importances_)[sorted_idx.astype(int)])
#
#plt.title("Important Features")
#plt.ylabel("Features")
#plt.xlabel("Feature Importance")
##plt.legend(["Train", "Val"])
#plt.show()

# --- Loss curves -------------------------------------------------------------
plt.plot(train_losses)
plt.plot(val_losses)
plt.title("Losses")
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend(["Train", "Val"])
plt.show()
