In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import datetime
import numpy as np
import pandas as pd
from pprint import pprint


import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
aapl = pd.read_csv('/content/drive/My Drive/DeepLearning/AAPL.csv')
aapl.shape

(9950, 7)

In [None]:
aapl.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.513393,0.515625,0.513393,0.513393,0.405683,117258400.0
1,1980-12-15,0.488839,0.488839,0.486607,0.486607,0.384517,43971200.0
2,1980-12-16,0.453125,0.453125,0.450893,0.450893,0.356296,26432000.0
3,1980-12-17,0.462054,0.464286,0.462054,0.462054,0.365115,21610400.0
4,1980-12-18,0.475446,0.477679,0.475446,0.475446,0.375698,18362400.0


In [None]:
# aapl['Date'] =  pd.to_datetime(aapl['Date'])

In [None]:
aapl.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume       float64
dtype: object

In [None]:
# Counting null values
aapl.isna().values.sum()

6

In [None]:
# Removing NaN
aapl = aapl.dropna()
aapl.isna().values.sum()

0

In [None]:
# Convert the date strings to Unix epoch seconds stored as float64.
# The intermediate nanosecond count is cast with an explicit 'int64' so the
# result does not depend on the platform's default integer width.
aapl['Date'] = pd.to_datetime(aapl['Date']).astype('int64') / 10**9
aapl.dtypes

Date         float64
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume       float64
dtype: object

In [None]:
aapl.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,345427200.0,0.513393,0.515625,0.513393,0.513393,0.405683,117258400.0
1,345686400.0,0.488839,0.488839,0.486607,0.486607,0.384517,43971200.0
2,345772800.0,0.453125,0.453125,0.450893,0.450893,0.356296,26432000.0
3,345859200.0,0.462054,0.464286,0.462054,0.462054,0.365115,21610400.0
4,345945600.0,0.475446,0.477679,0.475446,0.475446,0.375698,18362400.0


In [None]:
def technical_indicators(dataset):
    """Add price-derived indicator columns to ``dataset`` and return it.

    Every indicator is computed from ``dataset['Adj Close']``. The new columns
    are written onto ``dataset`` itself, so the caller's frame is modified and
    the returned object is that same frame.

    Columns added, in order: ``ma7``, ``ma21``, ``26ema``, ``12ema``, ``MACD``,
    ``20sd``, ``upper_band``, ``lower_band``, ``ema``, ``RSI``.

    Notes:
        * The Bollinger bands are centred on the 21-row mean (``ma21``) while
          their width comes from the 20-row standard deviation (``20sd``).
        * Rolling columns are NaN until their window is full.
    """
    price = dataset['Adj Close']

    # Simple moving averages over 7 and 21 rows
    dataset['ma7'] = price.rolling(window=7).mean()
    dataset['ma21'] = price.rolling(window=21).mean()

    # MACD: 12-span EMA minus 26-span EMA
    dataset['26ema'] = price.ewm(span=26).mean()
    dataset['12ema'] = price.ewm(span=12).mean()
    dataset['MACD'] = dataset['12ema'] - dataset['26ema']

    # Bollinger bands: ma21 +/- two rolling standard deviations
    dataset['20sd'] = price.rolling(window=20, center=False).std()
    two_sd = dataset['20sd'] * 2
    dataset['upper_band'] = dataset['ma21'] + two_sd
    dataset['lower_band'] = dataset['ma21'] - two_sd

    # Fast exponential moving average
    dataset['ema'] = price.ewm(com=0.5).mean()

    # RSI from 14-row simple averages of gains and losses.
    # The first difference is undefined, so it is sliced off; on assignment the
    # result is aligned by index and that first row ends up NaN.
    change = price.diff()[1:]
    gains = change.mask(change < 0, 0)
    losses = change.mask(change > 0, 0).abs()

    avg_gain = gains.rolling(14).mean()
    avg_loss = losses.rolling(14).mean()

    relative_strength = avg_gain / avg_loss
    dataset['RSI'] = 100.0 - (100.0 / (1.0 + relative_strength))

    return dataset

In [None]:
aapl_t = technical_indicators(aapl)

In [None]:
# Remove the first 20 values in order to have no NaN
aapl_t = aapl_t[20:]
aapl_t.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,ma7,ma21,26ema,12ema,MACD,20sd,upper_band,lower_band,ema,RSI
20,348192000.0,0.546875,0.546875,0.544643,0.544643,0.430377,5762400.0,0.445748,0.437684,0.445946,0.448352,0.002405,0.045701,0.529086,0.346283,0.435434,52.755916
21,348278400.0,0.546875,0.549107,0.546875,0.546875,0.432141,3572800.0,0.439448,0.438944,0.444693,0.445793,0.0011,0.043902,0.526749,0.35114,0.433239,49.152469
22,348364800.0,0.558036,0.5625,0.558036,0.558036,0.44096,3516800.0,0.437432,0.441632,0.44436,0.445033,0.000673,0.039051,0.519734,0.36353,0.438386,45.454405
23,348451200.0,0.555804,0.555804,0.553571,0.553571,0.437432,3348800.0,0.437684,0.445496,0.443751,0.443842,9.1e-05,0.034227,0.51395,0.377041,0.43775,29.545352
24,348710400.0,0.587054,0.589286,0.587054,0.587054,0.46389,10393600.0,0.442976,0.450199,0.445498,0.446975,0.001477,0.029583,0.509365,0.391034,0.455177,37.37387


In [None]:
# Number of consecutive rows in each input window (preprocess() uses it as the
# deque length and as the "window is full" threshold).
SEQ_LEQ = 24
# Look-ahead, in rows, applied when the 'Future' column is built with shift().
FORECAST_PERIOD = 1

In [None]:
def classify(current, future):
    """Return 1 if ``future`` is strictly greater than ``current``, else 0.

    Both arguments are coerced with ``float()`` before the comparison, so
    numeric strings are accepted. Equal values yield 0. A NaN on either side
    makes the comparison False, so a missing value also yields 0.
    """
    went_up = float(future) > float(current)
    return 1 if went_up else 0

In [None]:
# 'Future' holds the adjusted close FORECAST_PERIOD rows ahead of each row.
# The last FORECAST_PERIOD rows have nothing to look ahead to and become NaN.
aapl['Future'] = aapl['Adj Close'].shift(-FORECAST_PERIOD)

In [None]:
# Label each row 1 when the look-ahead price is higher than the current one, else 0.
# NOTE(review): classify() falls into its else-branch when 'Future' is NaN, so the
# trailing rows without a look-ahead value are labelled 0 instead of being excluded.
aapl['Label'] = list(map(classify, aapl['Adj Close'], aapl['Future']))

In [None]:
aapl.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Future,Label
0,345427200.0,0.513393,0.515625,0.513393,0.513393,0.405683,117258400.0,0.384517,0
1,345686400.0,0.488839,0.488839,0.486607,0.486607,0.384517,43971200.0,0.356296,0
2,345772800.0,0.453125,0.453125,0.450893,0.450893,0.356296,26432000.0,0.365115,1
3,345859200.0,0.462054,0.464286,0.462054,0.462054,0.365115,21610400.0,0.375698,1
4,345945600.0,0.475446,0.477679,0.475446,0.475446,0.375698,18362400.0,0.398628,1


In [None]:
aapl_s = aapl[['Date', 'Adj Close', 'Volume', 'Label']]
aapl_s.head()

Unnamed: 0,Date,Adj Close,Volume,Label
0,345427200.0,0.405683,117258400.0,0
1,345686400.0,0.384517,43971200.0,0
2,345772800.0,0.356296,26432000.0,1
3,345859200.0,0.365115,21610400.0,1
4,345945600.0,0.375698,18362400.0,1


### Creating Sequences

In [None]:
aapl_s = aapl_s.set_index(['Date'])

In [None]:
# Positional split with no shuffling: the first 90% of rows go to training and
# the remaining rows to validation.
train_samples = int(aapl_s.shape[0] * 0.9)

train_aapl = aapl_s.iloc[:train_samples]
validation_aapl = aapl_s.iloc[train_samples:]

In [None]:
train_aapl.head()

Unnamed: 0_level_0,Adj Close,Volume,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
345427200.0,,117258400.0,0
345686400.0,-0.052174,43971200.0,0
345772800.0,-0.073393,26432000.0,1
345859200.0,0.024752,21610400.0,1
345945600.0,0.028985,18362400.0,1


In [None]:
import random
from collections import deque
from sklearn import preprocessing

In [None]:
def preprocess(df, seq_len=None):
    """Turn a frame into a shuffled, class-balanced list of ``[window, label]`` pairs.

    The last column of ``df`` is treated as the label and every other column
    as a feature. Rows containing NaN are ignored. A sliding window of
    ``seq_len`` consecutive rows is paired with the label of the window's last
    row. The larger class is then truncated so both labels (0 and 1) are
    equally represented, and the result is shuffled.

    Args:
        df: DataFrame whose final column holds the 0/1 label. It is NOT
            modified; NaN rows are dropped on an internal copy.
        seq_len: Window length in rows. Defaults to the module-level
            ``SEQ_LEQ`` when omitted.

    Returns:
        list of ``[np.ndarray of shape (seq_len, n_features), label]``.

    Raises:
        ValueError: if ``seq_len`` is smaller than 1.

    Note:
        Feature values are passed through unscaled.
    """
    if seq_len is None:
        seq_len = SEQ_LEQ
    if seq_len < 1:
        raise ValueError(f'seq_len must be >= 1, got {seq_len}')

    # dropna() returns a new frame. The previous in-place drop modified the
    # caller's slice and triggered pandas' SettingWithCopyWarning.
    clean = df.dropna()

    sequential_data = []
    window = deque(maxlen=seq_len)

    for row in clean.values:
        window.append(list(row[:-1]))
        # Only emit once the deque holds a full window.
        if len(window) == seq_len:
            sequential_data.append([np.array(window), row[-1]])

    random.shuffle(sequential_data)

    ups = [[seq, label] for seq, label in sequential_data if label == 1]
    downs = [[seq, label] for seq, label in sequential_data if label == 0]

    random.shuffle(downs)
    random.shuffle(ups)

    # Balance the classes by truncating the larger one.
    lower = min(len(downs), len(ups))
    sequential_data = ups[:lower] + downs[:lower]
    random.shuffle(sequential_data)

    return sequential_data

In [None]:
train_sequences_1 = preprocess(train_aapl)
test_sequences_1 = preprocess(validation_aapl)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# tensor-conversion cells still run on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reproducibility: deterministic cuDNN kernels plus fixed NumPy / PyTorch seeds.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f18c54e14b0>

In [None]:
test_sequences_1[0][1]

1.0

In [None]:
test_sequences_1[0][0]

array([[ 7.21606416e-03,  4.05291000e+07],
       [ 1.96276705e-02,  3.27557000e+07],
       [ 5.05031315e-02,  4.87487000e+07],
       [-9.12728844e-03,  3.27886000e+07],
       [ 7.94574137e-03,  3.92813000e+07],
       [-1.35687243e-02,  5.38125000e+07],
       [-2.07567332e-02,  3.25038000e+07],
       [-3.09103579e-02,  4.52479000e+07],
       [ 2.88035392e-02,  2.92643000e+07],
       [-3.87539538e-03,  3.12036000e+07],
       [ 2.88696307e-02,  3.16272000e+07],
       [ 7.06799076e-04,  2.92719000e+07],
       [-1.62094443e-02,  2.80012000e+07],
       [ 3.28452817e-02,  3.43202000e+07],
       [ 2.10960812e-02,  4.54576000e+07],
       [-1.60993650e-02,  6.01542000e+07],
       [ 1.41487894e-02,  3.33920000e+07],
       [ 1.50088709e-02,  3.69378000e+07],
       [ 1.03172302e-02,  3.55834000e+07],
       [ 1.03449058e-02,  2.88038000e+07],
       [ 2.38017378e-02,  3.35120000e+07],
       [ 1.57353527e-02,  3.64866000e+07],
       [-1.14282273e-02,  4.05753000e+07],
       [-1.

### Convert to Tensor

In [None]:
def tensor_converter_e(sequence, label):
  """Convert one ``(sequence, label)`` pair into tensors on ``device``.

  Returns a tuple ``(seq_tensor, label_tensor)``. The sequence tensor keeps
  whatever dtype ``torch.tensor`` infers from ``sequence``; the label tensor is
  cast to ``long``.
  """
  def _on_device(value):
    return torch.tensor(value).to(device)

  return _on_device(sequence), _on_device(label).long()

In [None]:
train_sequences_tensor_labelLong = [tensor_converter_e(seq, label) for seq, label in train_sequences_1]
test_sequences_tensor_labelLong = [tensor_converter_e(seq, label) for seq, label in test_sequences_1]

#### Data structure

In [None]:
train_sequences_tensor_labelLong[0]

(tensor([[ 3.5329e-03,  2.4752e+07],
         [-2.8169e-02,  2.3324e+07],
         [-1.4492e-02,  1.3065e+07],
         [ 1.4705e-02,  2.1207e+07],
         [-2.1739e-02,  3.0562e+07],
         [-1.4814e-02,  1.9191e+07],
         [ 1.5037e-02,  1.6758e+07],
         [ 2.2222e-02,  2.0521e+07],
         [-1.4492e-02,  2.1885e+07],
         [-2.2059e-02,  2.7796e+07],
         [ 0.0000e+00,  4.4332e+07],
         [ 1.8797e-02,  1.0396e+07],
         [-1.4761e-02,  1.6310e+07],
         [-3.7446e-03,  2.0101e+07],
         [ 3.7587e-03,  1.6153e+07],
         [ 3.7466e-03,  1.3219e+07],
         [-7.4632e-03,  1.8701e+07],
         [-2.2556e-02,  3.4762e+07],
         [ 3.8454e-03,  2.6351e+07],
         [-1.5325e-02,  1.9673e+07],
         [-1.5565e-02,  3.3018e+07],
         [-1.9763e-02,  3.3502e+07],
         [ 5.4436e-02,  3.5563e+07],
         [-2.4856e-02,  2.7412e+07]], device='cuda:0', dtype=torch.float64),
 tensor(0, device='cuda:0'))

In [None]:
test_sequences_tensor_labelLong[0][1]

tensor(1, device='cuda:0')

##### General setup for models


In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

### Shallow LSTM

In [None]:
class ShallowLSTM(nn.Module):
  """Feed-forward input projection -> LSTM -> linear classifier.

  ``hparams`` must expose ``embedding_dim``, ``hidden_dim``, ``bidirectional``,
  ``num_layers``, ``dropout`` and ``num_classes``. An optional ``input_dim``
  attribute sets the number of features per time step and defaults to 2.
  """

  def __init__(self, hparams):
    super(ShallowLSTM, self).__init__()

    # Echo the configuration this instance was built with. This prints the
    # argument itself rather than a notebook-level global.
    pprint(hparams)

    input_dim = getattr(hparams, 'input_dim', 2)
    half_dim = hparams.embedding_dim // 2

    self.input_projection = nn.Sequential(
        nn.Linear(input_dim, half_dim),
        nn.Tanh(),
        nn.Linear(half_dim, hparams.embedding_dim),
    )

    # nn.LSTM applies dropout only between stacked layers. With a single layer
    # a non-zero value does nothing except raise a warning, so pass 0 then.
    lstm_dropout = hparams.dropout if hparams.num_layers > 1 else 0

    self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim,
                        bidirectional=hparams.bidirectional,
                        num_layers=hparams.num_layers,
                        dropout=lstm_dropout,
                        batch_first=True)

    # Use truthiness (as nn.LSTM does) so any falsy flag counts as
    # unidirectional when sizing the classifier input.
    lstm_output_dim = hparams.hidden_dim * 2 if hparams.bidirectional else hparams.hidden_dim

    self.dropout = nn.Dropout(hparams.dropout)
    self.classifier = nn.Linear(lstm_output_dim, hparams.num_classes)

    # Kept for backward compatibility only: forward() never reads it, because
    # the LSTM is called without an explicit initial state.
    self.hidden_cell = (torch.zeros(1,1,hparams.hidden_dim),
                        torch.zeros(1,1,hparams.hidden_dim))

  def forward(self, x):
    """Return class logits for a batch of sequences.

    ``x`` is cast to float32, projected, run through the LSTM, and the output
    at the last time step is passed through dropout and the classifier.
    The LSTM is batch-first, so ``x`` is indexed as (batch, time, features).
    """
    embeddings = self.input_projection(x.float())
    o, (h, c) = self.lstm(embeddings)
    o = self.dropout(o[:, -1])  # keep only the final time step
    output = self.classifier(o)

    return output

  def predict(self, x):
    """Return the arg-max class index for each sequence in ``x``."""
    logits = self.forward(x)
    predictions = torch.argmax(logits, dim=-1)
    return predictions

In [None]:
class HParams:
  """Hyper-parameter bundle read by ShallowLSTM at construction time."""

  # Projection / recurrent sizes
  embedding_dim = 128
  hidden_dim = 64

  # LSTM topology
  num_layers = 1
  bidirectional = False
  dropout = 0

  # Size of the classifier output
  num_classes = 2


params = HParams()

### Trainer Class

In [None]:
class Trainer():
  """Small fit / evaluate / predict wrapper around a model, a loss and an optimizer."""

  def __init__(
    self,
    model: nn.Module,
    loss_function,
    optimizer):
    self.model, self.loss_function, self.optimizer = model, loss_function, optimizer

  def _batch_loss(self, batch):
    """Run the model on one batch and return its loss tensor.

    ``batch[0]`` is fed to the model and ``batch[1]`` holds the targets. The
    logits are flattened to 2-D and the targets to 1-D before the loss call.
    """
    inputs, targets = batch[0], batch[1]
    logits = self.model(inputs)
    logits = logits.view(-1, logits.shape[-1])
    return self.loss_function(logits, targets.view(-1))

  def train(self, train_dataset,
            valid_dataset,
            epochs):
    """Train for ``epochs`` passes, printing training and validation loss each epoch.

    Returns the mean over epochs of the per-epoch average training loss.
    """
    train_loss = 0.0

    for epoch in range(epochs):
      print(f'Epoch {epoch+1}')

      self.model.train()
      running_loss = 0.0

      for batch in train_dataset:
        self.optimizer.zero_grad()
        batch_loss = self._batch_loss(batch)
        batch_loss.backward()
        self.optimizer.step()

        # A scalar loss tensor converts to a plain Python float here.
        running_loss += batch_loss.tolist()

      avg_epoch_loss = running_loss / len(train_dataset)
      train_loss += avg_epoch_loss
      print(f'\t[Epoch: {epoch+1}] Training Loss = {avg_epoch_loss}')

      valid_loss = self.evaluate(valid_dataset)
      print(f'\t[Epoch: {epoch+1}] Validation Loss = {valid_loss}')

    print('Training has finished')

    return train_loss / epochs

  def evaluate(self, valid_dataset):
    """Return the average per-batch loss over ``valid_dataset``.

    Switches the model to eval mode and computes losses without gradients.
    """
    self.model.eval()
    total = 0.0

    with torch.no_grad():
      for batch in valid_dataset:
        total += self._batch_loss(batch).tolist()

    return total / len(valid_dataset)

  def predict(self, x):
    """Return ``(logits, predictions)`` for ``x`` in eval mode, without gradients."""
    self.model.eval()

    with torch.no_grad():
      logits = self.model(x)
      return logits, torch.argmax(logits, -1)

### Shallow Model Training

In [None]:
# Place the model on the notebook's configured `device` (the one the input
# tensors are moved to) instead of calling .cuda() directly.
model = ShallowLSTM(params).to(device)
model

<__main__.HParams object at 0x7f18710266a0>


ShallowLSTM(
  (input_projection): Sequential(
    (0): Linear(in_features=2, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=128, bias=True)
  )
  (lstm): LSTM(128, 64, batch_first=True)
  (dropout): Dropout(p=0, inplace=False)
  (classifier): Linear(in_features=64, out_features=2, bias=True)
)

In [None]:
trainer = Trainer(
    model = model,
    loss_function = nn.CrossEntropyLoss(),
    optimizer = optim.Adam(model.parameters()),
)

In [None]:
# Batch the (sequence, label) tensor pairs in groups of 64.
# No shuffle argument is passed, so each epoch iterates in the stored list order
# (the lists themselves were shuffled once inside preprocess()).
train_dataset = DataLoader(train_sequences_tensor_labelLong, batch_size=64)
valid_dataset = DataLoader(test_sequences_tensor_labelLong, batch_size=64)

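A cross-entropy loss of about 0.693 (= ln 2) in binary classification means the model is predicting at chance level, i.e. it is doing no better than random guessing.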

In [None]:
trainer.train(train_dataset, valid_dataset, 12)

Epoch 1
	[Epoch: 1] Training Loss = 0.6950271368896874
	[Epoch: 1] Validation Loss = 0.6942437504019056
Epoch 2
	[Epoch: 2] Training Loss = 0.6938936375353458
	[Epoch: 2] Validation Loss = 0.6934896750109536
Epoch 3
	[Epoch: 3] Training Loss = 0.6936876730327188
	[Epoch: 3] Validation Loss = 0.6933210449559348
Epoch 4
	[Epoch: 4] Training Loss = 0.6935125854763672
	[Epoch: 4] Validation Loss = 0.6932495576994759
Epoch 5
	[Epoch: 5] Training Loss = 0.693386129654237
	[Epoch: 5] Validation Loss = 0.6932304331234523
Epoch 6
	[Epoch: 6] Training Loss = 0.6933236792139763
	[Epoch: 6] Validation Loss = 0.6932088945593152
Epoch 7
	[Epoch: 7] Training Loss = 0.6932845411509493
	[Epoch: 7] Validation Loss = 0.6931869855948857
Epoch 8
	[Epoch: 8] Training Loss = 0.6932590760453774
	[Epoch: 8] Validation Loss = 0.6931711435317993
Epoch 9
	[Epoch: 9] Training Loss = 0.6932417502368453
	[Epoch: 9] Validation Loss = 0.6931633736406054
Epoch 10
	[Epoch: 10] Training Loss = 0.6932306837861555
	[Epoch:

0.6935241799937547

### F-Score

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

import seaborn as sns

  import pandas.util.testing as tm


In [None]:

def f_score(labels_s, predictions_s):
  
  flat_labels_s = flat_list(labels_s)
  flat_predictions_s = flat_list(predictions_s)

  label_distribution = count(flat_labels_s)
  pred_distribution = count(flat_predictions_s)

  print(f'# instances: {len(flat_list(labels_s))}')

  keys = set(label_distribution.keys()) | set(pred_distribution.keys())
  for k in keys:
      print(f'\t# {k}: ({label_distribution.get(k, 0)}, {pred_distribution.get(k, 0)})')

  p = precision_score(flat_labels_s, flat_predictions_s, average='macro')
  r = recall_score(flat_labels_s, flat_predictions_s, average='macro')
  f = f1_score(flat_labels_s, flat_predictions_s, average='macro')

  print(f'# precision: {p:.4f}')
  print(f'# recall: {r:.4f}')
  print(f'# f1: {f:.4f}')
