Mount dataset:

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be copied into the runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r drive/MyDrive/ColabNotebooks/cnnPred/Dataset/ /content/

# **Import libraries:**

In [None]:
import numpy as np
import pandas as pd
# NOTE: the former `from sklearn.preprocessing import scaledata` was removed:
# scikit-learn exports no such name, so the import raised ImportError and
# prevented this cell from running on a fresh kernel.
from sklearn.preprocessing import StandardScaler
from os.path import join
from os import listdir
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
# import os
import random
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D, Input
# from pathlib2 import Path
from tensorflow.keras import backend as K, callbacks
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredLogarithmicError, MSE, MAE

# **Declare constants:**

In [None]:
# Directory that is scanned for the per-index CSV files.
DATADIR = "/content/Dataset"
# Date boundary: rows dated before it feed training/validation (see datagen),
# rows on or after it feed testing (see testgen).
TRAIN_TEST_CUTOFF = '2016-04-21'
# Fraction of the pre-cutoff rows used for training; the remainder is validation.
TRAIN_VALID_RATIO = 0.75

# Number of consecutive rows that make up one input window.
seq_len = 60
# Number of samples per generated batch.
batch_size = 128
# Number of epochs passed to model.fit.
n_epochs = 20
# Number of feature columns per row (width of the model input).
n_features = 82

# **Data Preprocessing:**

In [None]:
from os.path import join
from os import listdir


# Maps index name (taken from the CSV's "Name" column) -> preprocessed DataFrame.
data = {}

print("data we have: ")
for filename in listdir(DATADIR):
    if not filename.lower().endswith(".csv"):
        continue # read only the CSV files

    filepath = join(DATADIR, filename)
    X = pd.read_csv(filepath, index_col="Date", parse_dates=True)
    # Basic preprocessing: take the index name from the first row, then drop
    # the column so only numeric features remain.
    # .iloc[0] is explicit positional access; plain [0] on a date-indexed
    # Series relies on a deprecated positional fallback.
    name = X["Name"].iloc[0]
    print(name, end=' | ')
    del X["Name"]
    # Feature columns, captured before "Target" is added so the label is never scaled.
    cols = X.columns
    # The line below computes the percentage change of the closing index and
    # aligns it with the previous day, then converts it to 1 or 0 depending on
    # whether the next day's change is positive. The target is stored as a
    # column so dropna() keeps features and labels aligned.
    # NOTE(review): the final row has no next-day change (NaN); the comparison
    # turns that into 0 rather than a missing value, so dropna() keeps that row.
    X["Target"] = (X["Close"].pct_change().shift(-1) > 0).astype(int)
    X.dropna(inplace=True)
    # Fit the standard scaler on the training portion only: rows BEFORE the
    # train/test cutoff, restricted to the first TRAIN_VALID_RATIO of them.
    # (Fitting on rows after the cutoff would leak test-set statistics.)
    index = X.index[X.index < TRAIN_TEST_CUTOFF]
    index = index[:int(len(index) * TRAIN_VALID_RATIO)]
    scaler = StandardScaler().fit(X.loc[index, cols])
    # Save scale transformed dataframe
    X[cols] = scaler.transform(X[cols])
    data[name] = X

data we have: 
NYA | S&P | RUT | NASDAQ | DJI | 

In [None]:
# Display the preprocessed frame stored under the 'NYA' key as a sanity check.
data['NYA']

Unnamed: 0_level_0,Close,Volume,mom,mom1,mom2,mom3,ROC_5,ROC_10,ROC_15,ROC_20,...,silver-F,RUSSELL-F,S&P-F,CHF,Dollar index-F,Dollar index,wheat-F,XAG,XAU,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-10-19,-8.770014,1.232144,-3.521385,0.921290,-0.543432,-0.366935,-0.948677,-0.511275,0.260829,0.537023,...,-1.687757,-1.957717,-2.512805,2.946985,4.084704,4.087215,-1.723464,-3.030445,-3.179746,1
2010-10-20,-8.518337,-0.586797,2.083192,-3.308400,0.852650,-0.585733,-0.665580,0.158863,1.015944,1.432813,...,0.690639,0.791223,1.511489,-2.225994,-3.141538,-3.203745,1.762781,1.802923,1.136816,0
2010-10-21,-8.538791,-0.475087,-0.394430,2.015788,-2.746574,0.924213,-0.599892,0.279328,1.088429,1.859468,...,-2.410474,-0.702541,-0.064847,1.517609,0.704292,0.830085,-2.072089,-2.384037,-1.935332,1
2010-10-26,-8.500773,-0.092545,-0.561486,0.340683,0.062240,-0.234660,0.772057,-0.110089,0.136768,0.804475,...,1.603891,-0.391340,-0.231754,2.720100,1.953575,2.007663,2.535224,0.634982,0.022028,0
2010-10-27,-8.626234,0.086975,-1.337634,-0.496580,0.358587,0.069337,-0.719090,-1.047763,-0.290349,0.553612,...,-1.937423,-0.422460,-0.862288,1.131904,1.488156,1.456456,0.457148,-0.885546,-1.339050,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-09,2.551948,-0.126560,-0.502278,-0.672391,-0.037127,0.278825,-0.605010,-0.600994,-0.555871,-0.211174,...,1.919258,-1.044862,-0.213208,-2.430191,-0.226546,-0.247273,0.450312,1.949834,1.499770,0
2017-08-10,2.155284,0.405858,-2.460125,-0.440334,-0.503481,-0.038135,-1.441596,-1.358462,-1.223169,-0.919267,...,0.802332,-2.196304,-2.902253,-0.184029,-0.324528,-0.372548,-2.858203,0.899421,1.162741,0
2017-08-16,2.399676,-0.001186,0.153807,-0.334502,1.014468,-0.179659,-0.672641,-0.964319,-0.904022,-0.914748,...,0.808902,-0.080139,0.046423,-1.522651,-0.814443,-0.798480,-1.668778,1.905761,1.097928,0
2017-08-17,2.007334,0.251304,-2.446978,0.182927,-0.215957,1.099229,-0.666869,-1.612134,-1.543241,-1.547507,...,0.454114,-2.227424,-3.087704,-0.637799,0.263369,0.253824,-0.814306,-0.209756,0.631273,0


# **Training set generator:**

In [None]:
def datagen(data, seq_len, batch_size, targetcol, kind):
    """Endlessly yield random (X, y) batches for a Keras model.

    Each sample is drawn by picking a random frame from ``data``, then a random
    timestamp from that frame's pre-cutoff rows, and taking the ``seq_len`` rows
    that end at that timestamp.

    Args:
        data: dict mapping a key to a DataFrame of features plus ``targetcol``.
        seq_len: number of consecutive rows in one sample window.
        batch_size: number of samples collected before a batch is yielded.
        targetcol: name of the label column; every other column is an input.
        kind: 'train' for the first TRAIN_VALID_RATIO share of the pre-cutoff
            rows, 'valid' for the rest.

    Yields:
        Tuple ``(X, y)``: ``X`` is the stacked windows with a trailing axis
        added at position 3, ``y`` the array of matching labels.

    Raises:
        AssertionError: if the training share is not longer than ``seq_len``.
        NotImplementedError: if ``kind`` is neither 'train' nor 'valid'.
    """
    windows, labels = [], []
    while True:
        # Draw one frame from the pool for this sample
        df = data[random.choice(list(data.keys()))]
        input_cols = [col for col in df.columns if col != targetcol]
        candidates = df.index[df.index < TRAIN_TEST_CUTOFF]
        split = int(len(candidates) * TRAIN_VALID_RATIO)
        assert split > seq_len, "Training data too small for sequence length {}".format(seq_len)
        if kind == 'train':
            candidates = candidates[:split]   # training range
        elif kind == 'valid':
            candidates = candidates[split:]   # validation range
        else:
            raise NotImplementedError
        # Redraw the end timestamp until it has a full window of history
        while True:
            stamp = random.choice(candidates)
            pos = (df.index == stamp).argmax()  # row position of the timestamp
            if pos + 1 >= seq_len:
                break
        window = df.iloc[pos - seq_len + 1:pos + 1]
        windows.append(window[input_cols].values)
        labels.append(df.loc[stamp, targetcol])
        # Dispatch once a full batch has been collected
        if len(windows) == batch_size:
            yield np.expand_dims(np.array(windows), 3), np.array(labels)
            windows, labels = [], []

# **Metrics:**

In [None]:
def recall_m(y_true, y_pred):
    """Return recall: true positives over actual positives (plus epsilon).

    Both counts are taken after clipping to [0, 1] and rounding; K.epsilon()
    in the denominator avoids division by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_count + K.epsilon())
 
def precision_m(y_true, y_pred):
    """Return precision: true positives over predicted positives (plus epsilon).

    Both counts are taken after clipping to [0, 1] and rounding; K.epsilon()
    in the denominator avoids division by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (predicted_count + K.epsilon())
 
def f1_m(y_true, y_pred):
    """Return the F1 score built from precision_m and recall_m.

    K.epsilon() is added to the denominator so the ratio stays defined when
    precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio
 
def f1macro(y_true, y_pred):
    """Return the mean of the F1 score for the positive and the negative class.

    The negative-class score is obtained by calling f1_m on the complemented
    labels and the complemented, clipped predictions.
    """
    positive_f1 = f1_m(y_true, y_pred)
    # Flip labels and predictions so the negative class plays the positive role
    flipped_truth = 1 - y_true
    flipped_pred = 1 - K.clip(y_pred, 0, 1)
    negative_f1 = f1_m(flipped_truth, flipped_pred)
    return (positive_f1 + negative_f1) / 2

# **Define Model:**

In [None]:
def cnnpred_2d(seq_len=60, n_features=82, n_filters=(8,8,8), droprate=0.1):
    """Build the 2D-CNNpred model according to the paper.

    Args:
        seq_len: number of rows in one input window.
        n_features: number of feature columns per row.
        n_filters: filter counts for the three convolution layers, in order.
        droprate: dropout rate applied before the output layer.

    Returns:
        An uncompiled Sequential model taking input of shape
        (seq_len, n_features, 1) and ending in one sigmoid unit.
    """
    layers = [Input(shape=(seq_len, n_features, 1))]
    # First convolution spans the whole feature axis of a single row
    layers.append(Conv2D(n_filters[0], kernel_size=(1, n_features), activation="relu"))
    # Two identical stages along the row axis: 3x1 convolution, then 2x1 max-pooling
    for stage in (1, 2):
        layers.append(Conv2D(n_filters[stage], kernel_size=(3,1), activation="relu"))
        layers.append(MaxPool2D(pool_size=(2,1)))
    layers.append(Flatten())
    layers.append(Dropout(droprate))
    layers.append(Dense(1, activation="sigmoid"))
    return Sequential(layers)

In [None]:
def cnnpred_2d_mine(seq_len=60, n_features=82, n_filters=(8,8,8), droprate=0.1):
    """Build the 2D-CNNpred variant intended as a custom architecture.

    The layer stack is currently the same as cnnpred_2d, layer for layer.

    Args:
        seq_len: number of rows in one input window.
        n_features: number of feature columns per row.
        n_filters: filter counts for the three convolution layers, in order.
        droprate: dropout rate applied before the output layer.

    Returns:
        An uncompiled Sequential model taking input of shape
        (seq_len, n_features, 1) and ending in one sigmoid unit.
    """
    window_input = Input(shape=(seq_len, n_features, 1))
    # Convolution across the full feature axis of each row
    feature_conv = Conv2D(n_filters[0], kernel_size=(1, n_features), activation="relu")
    # Two convolution + pooling stages along the row axis
    first_conv = Conv2D(n_filters[1], kernel_size=(3,1), activation="relu")
    first_pool = MaxPool2D(pool_size=(2,1))
    second_conv = Conv2D(n_filters[2], kernel_size=(3,1), activation="relu")
    second_pool = MaxPool2D(pool_size=(2,1))
    # Classifier head
    flatten = Flatten()
    dropout = Dropout(droprate)
    output = Dense(1, activation="sigmoid")
    stack = [
        window_input,
        feature_conv,
        first_conv,
        first_pool,
        second_conv,
        second_pool,
        flatten,
        dropout,
        output,
    ]
    return Sequential(stack)

In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPooling2D, BatchNormalization, Activation
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.regularizers import Regularizer, L2

def cnnpred_2d_resnet(seq_len=60, n_features=82, n_filters=(8,8,8), droprate=0.1):
    """Build a classifier with an untrained ResNet50 backbone and a dense head.

    Args:
        seq_len: number of rows in one input window.
        n_features: number of feature columns per row.
        n_filters: accepted for signature parity with the other builders; not
            used in this function.
        droprate: accepted for signature parity with the other builders; not
            used in this function (both dropout layers use a fixed 0.5).

    Returns:
        An uncompiled Sequential model taking input of shape
        (seq_len, n_features, 1) and ending in one sigmoid unit.
    """
    classifier = Sequential()
    add = classifier.add

    # Backbone: ResNet50 without its top, randomly initialised (weights=None)
    add(ResNet50(include_top=False, weights=None, input_shape=(seq_len, n_features, 1)))
    add(GlobalMaxPooling2D())

    # Regularised dense head
    add(Dense(512, activation='relu', kernel_regularizer=L2(l2=0.015), kernel_constraint=MaxNorm(2)))
    add(Dropout(0.5))
    add(BatchNormalization())
    add(Dense(64, activation='relu', kernel_regularizer=L2(l2=0.01), kernel_constraint=MaxNorm(2)))
    add(Dropout(0.5))

    # Single sigmoid output unit
    add(Dense(1, activation='sigmoid'))

    return classifier

# **Fit and Train**

In [None]:
# Produce CNNpred as a binary classification problem
model = cnnpred_2d_mine(seq_len, n_features)

# loss = MSE(
#     reduction="auto", name="mean_squared_logarithmic_error"
# )
opt = Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='mae', metrics=["acc", f1macro])
model.summary()  # print model structure to console

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 60, 1, 8)          664       
                                                                 
 conv2d_7 (Conv2D)           (None, 58, 1, 8)          200       
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 29, 1, 8)         0         
 2D)                                                             
                                                                 
 conv2d_8 (Conv2D)           (None, 27, 1, 8)          200       
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 13, 1, 8)         0         
 2D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 104)              

In [None]:
# Set up callbacks and fit the model
# We use custom validation score f1macro() and hence monitor for "val_f1macro"
checkpoint_path = "./cp2d-{epoch}-{val_f1macro:.2f}.h5"
# The list is named fit_callbacks (not `callbacks`) so it does not shadow the
# `callbacks` module imported from tensorflow.keras in the imports cell.
fit_callbacks = [
    # Save the full model each time the validation macro-F1 reaches a new maximum.
    ModelCheckpoint(checkpoint_path,
                    monitor='val_f1macro', mode="max",
                    verbose=0, save_best_only=True, save_weights_only=False, save_freq="epoch")
]
# Both generators are endless, so the number of batches per epoch and per
# validation pass is fixed explicitly.
model.fit(datagen(data, seq_len, batch_size, "Target", "train"),
          validation_data=datagen(data, seq_len, batch_size, "Target", "valid"),
          epochs=n_epochs, steps_per_epoch=400, validation_steps=10, verbose=1, callbacks=fit_callbacks)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9f60890b90>

# **Test:**

In [None]:
def testgen(data, seq_len, targetcol):
    "Return array of all test samples"
    batch = []
    for key, df in data.items():
        input_cols = [c for c in df.columns if c != targetcol]
        # find the start of test sample
        t = df.index[df.index >= TRAIN_TEST_CUTOFF][0]
        n = (df.index == t).argmax()
        for i in range(n+1, len(df)+1):
            frame = df.iloc[i-seq_len:i]
            batch.append([frame[input_cols].values, frame[targetcol][-1]])
    X, y = zip(*batch)
    return np.expand_dims(np.array(X),3), np.array(y)

In [None]:
# Prepare test data
test_data, test_target = testgen(data, seq_len, "Target")

# Test the model: threshold the sigmoid outputs at 0.5 to get class labels
test_out = model.predict(test_data)
test_pred = (test_out > 0.5).astype(int)
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order
print("accuracy:", accuracy_score(test_target, test_pred))
print("MAE:", mean_absolute_error(test_target, test_pred))
print("F1:", f1_score(test_target, test_pred))

accuracy: 0.5434146341463415
MAE: 0.45658536585365855
F1: 0.7
