# CNN-LSTM Model for price movement

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import tensorflow as tf

import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Convolution1D, Conv1D, ZeroPadding1D, MaxPooling1D, BatchNormalization, Activation, Dropout, Flatten, Dense
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam

from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import classification_report

In [2]:
def rename_datetime(data):
    data.rename(columns={'Unnamed: 0': 'datetime'},inplace=True)
    data['datetime']=pd.to_datetime(data['datetime'])
    data.set_index('datetime', inplace=True)
    return data
# feature selection
# create features
def create_features(data, g_lag, tv_lag, tw_lag):
    data['Change'] =data['Close'].diff().dropna()
    data['Label'] = np.where(data['Change']>0, 1 ,0)
    data['google_trends_lag']=data['google_trends'].shift(g_lag)
    data['tweet_volume_lag']=data['tweet_volume'].shift(tv_lag)
    data['tw_polarity_lag'] = data['tw_polarity'].shift(tw_lag)

    data.drop(columns=['Open','High','Low','Change'],inplace=True)
    # Add features like RSI? Moving average?

    data.dropna(inplace=True)
    return data
# keep the wanted features
def keep_features(feature_conditions):
    features=['Label','Close']
    for feature, condition in feature_conditions.items():
        if condition:
            features.append(feature)
    return features

# create the lagged features based on the timesteps
def reshape_features(data, to_keep=1, to_remove=1):
    variables = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    columns, names = list(), list()

    for i in range(to_keep, 0, -1):
        columns.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(variables)]

    for i in range(0, to_remove):
        columns.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(variables)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(variables)]

    #put it all together
    final = pd.concat(columns, axis=1)
    final.columns = names

    #drop rows with NaN values
    final.dropna(inplace=True)

    new_data = final.reset_index()

    new_data = new_data.drop(columns=['datetime'])

    return new_data

# shuffle the data
def shuffle_data(times, data):
    np.random.seed(1)
    for i in range(times+1):
        data=shuffle(data)
    return data

# split labels from data
def split_label(train, test):
    train_y = train['var1(t)'].values
    test_y = test['var1(t)'].values
    train_y = train_y.reshape(len(train_y), 1)
    test_y = test_y.reshape(len(test_y), 1)
    return train_y, test_y

# normalize data using Minmaxscaler
def normalize_reshape_data(train, test, train_y, test_y, all_features, n_features, timestep):
    feature_scaler=MinMaxScaler()
    scale_train_data=feature_scaler.fit_transform(train)
    scale_test_data= feature_scaler.transform(test)
    train = scale_train_data[:, :all_features]
    test = scale_test_data[:, :all_features]
    #keep only prices array
    train_X, train_y = train[:, :all_features], train_y[:, -1]
    test_X, test_y = test[:, :all_features], test_y[:, -1]
    # reshape the data
    train_X = train_X.reshape((train_X.shape[0], timestep, n_features))
    test_X = test_X.reshape((test_X.shape[0], timestep, n_features))
    #set labels for training data to categorical
    train_y = to_categorical(train_y, 2)

    return train_X, test_X, train_y, test_y

In [3]:
filepath='./../data/processed_data.csv'

In [4]:
def get_data(filepath, g_lag, tv_lag, tw_lag, timestep, shuffle_times, split_ratio, feature_conditions, month, flag):
    # import data
    # import the original data. processed_data
    # processed_data: weighted reddit score+ fill the nan data
    data=pd.read_csv(filepath)
    # show the data
    data=rename_datetime(data)

    # get the subdata set
    if flag==0:
        data=data[data.index.month==month]
    else:
        data=data[(data.index.month>=1) & (data.index.month <=month)]

    # create features
    data_created = create_features(data,g_lag,tv_lag,tw_lag)

    # keep the wanted features
    features = keep_features(feature_conditions)
    data=data_created[features]

    # reshape the data
    # create the lagged features based on the timesteps
    df_copy = data.copy()
    new_data=reshape_features(df_copy, timestep, 1)

    # shuffle the data
    shuffled_data=shuffle_data(shuffle_times, new_data)

    # split the data
    train, test= train_test_split(shuffled_data, test_size=split_ratio)
    train_y, test_y=split_label(train, test)

    # normalized the data using MinMaxscaler
    n_features=len(features)
    all_features = timestep * n_features
    if (all_features==0):
        all_features=n_features
    train_X, test_X, train_y, test_y =normalize_reshape_data(train, test, train_y, test_y, all_features, n_features,timestep)

    print("train_X Shape:", train_X.shape)
    print("train_y Shape:", train_y.shape)
    print("test_X Shape:", test_X.shape)
    print("test_y Shape:", test_y.shape)

    return data, train_X, test_X, train_y, test_y, n_features

In [5]:
# # get correlation matrix
# sns.heatmap(data.corr(), annot=True)
# plt.show()

## Model Building

In [6]:
def create_model(neurons, epochs, dropout, batch_size, verbose,
                 activ_func, activ_dense,my_optimizer,
                 train_X, train_y, test_X, test_y, n_features, timestep ):
    #set seed to reproduce results
    np.random.seed(1)
    tf.random.set_seed(1)

    # design network
    model = Sequential()

    #return sequences flag if there are more than 1 layer
    # return_seq = layers > 1

    model.add(Conv1D(neurons, kernel_size=64,padding='same', input_shape=(timestep, n_features)))
    # model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(BatchNormalization())
    # Add ReLU activation layer
    model.add(Activation(activ_func))
    model.add(Dropout(dropout))

    model.add(Conv1D(neurons, kernel_size=32, padding='same'))
    # model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation(activ_func))
    model.add(Dropout(dropout))

    model.add(Conv1D(neurons, kernel_size=32, padding='same'))
    # model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation(activ_func))
    model.add(Dropout(dropout))

    model.add(Conv1D(neurons, kernel_size=8, padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation(activ_func))
    model.add(Dropout(dropout))

    model.add(Conv1D(neurons, kernel_size=1, padding='same'))
    # model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation(activ_func))
    model.add(Dropout(dropout))

    model.add(LSTM(32, return_sequences=False))
    model.add(BatchNormalization())
    model.add(Activation(activ_func))
    model.add(Dropout(dropout))

    model.add(Dense(200))
    model.add(Dense(100))
    model.add(Dense(100))
    model.add(Dense(100))
    model.add(Dense(100))
    model.add(Dense(100))
    model.add(Dense(100))

    #add a dense layer to output the prediction
    model.add(Dense(2, activation=activ_dense))
    model.compile(loss='categorical_crossentropy', optimizer=my_optimizer, metrics=['accuracy'])

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience =50)

    # fit network
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose, shuffle=False,validation_split=0.2, callbacks=[callback])

    #reshape
    test_X = test_X.reshape((test_X.shape[0], timestep, n_features))

    #make prediction
    pred = model.predict(test_X)

    #reshape again
    test_X = test_X.reshape((test_X.shape[0], timestep* n_features,))

    #get prediction
    y_pred = np.argmax(pred, axis=1)

    # calculate the metrics
    report=classification_report(
          test_y,
          y_pred,target_names = ["Down", "Up"],
          digits = 5, output_dict=True)

    # precision = report['Down']['precision']
    down_f1_score = report['Down']['f1-score']
    up_f1_score = report['Up']['f1-score']
    accuracy=report['accuracy']

    return down_f1_score, up_f1_score, accuracy

In [7]:
# model parameters

feature_conditions = {
        'google_trends': 0, 'google_trends_lag': 0,
        'tweet_volume_lag': 0, 'tw_polarity_lag': 0, 'tw_compound': 0,
        'tw_polarity': 0, 'tweet_volume': 0,'re_compound': 0,'re_polarity': 0,
        're_subjectivity': 0
    }
def test_model(filepath_out, feature_conditions):
    columns = ["timestep","features","google_trends_lag","tweet_volume_lag","tweet_polarity_score_lag", "batch_size", "neurons", "layers", "mean_down_f1_score","mean_up_f1_score", "mean_acc","min_acc", "max_acc", "diff_acc","optimizer","month","consecutive"]

    try:
        results = pd.read_csv(filepath_out)
    except:
        results = pd.DataFrame(columns=columns)

    #lagged_features
    timestep = [10,15]
    #train_ratio
    split_ratio =0.2
    shuffle_times = 3
    activ_func = "relu"
    activ_dense = 'softmax'
    my_optimizer = 'adam'
    # my_optimizer=RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)
    epochs = 10000
    months =[6]
     #for each lag feature
    for month in months:
        for step in timestep:
            neurons = [32]
            batch_sizes = [80]
            dropout = 0.25
            verbose=2
            # Lags
            g_lag = [2]
            tv_lag = [28] # tweets volume
            tw_lag = 15 # tweets score

            # set flag: 1 - consecutive months, flag: 0 - single month
            flag=1

            #for each epoch, neuron, layers and batch_size value
            for n in neurons:
                for b in  batch_sizes:
                    print("Testing model: lag:", timestep, ", neurons:", n,  ", batch_size:", b)
                    for g in g_lag:
                        for tv in tv_lag:
                            #run for 5 times
                            accuracies = []
                            down_f1_score =[]
                            up_f1_score = []
                            for i in range (0,5):
                                data, train_X, test_X, train_y, test_y, n_features = get_data(filepath, g, tv, tw_lag, step, shuffle_times, split_ratio, feature_conditions,month,flag)
                                down_score, up_score, accuracy = create_model(n, epochs, dropout, b, verbose, activ_func,activ_dense,my_optimizer, train_X, train_y, test_X, test_y, n_features, step)
                                accuracies.append(accuracy)
                                down_f1_score.append(down_score)
                                up_f1_score.append(up_score)

                                #calculate mean values
                            accuracies = np.array(accuracies)
                            mean_acc =accuracies.mean()
                            min_acc =accuracies.min()
                            max_acc =accuracies.max()
                            diff_acc = max_acc - min_acc
                            mean_down_f1_score= np.array(down_f1_score).mean()
                            mean_up_f1_score=np.array(up_f1_score).mean()

                            results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n,  "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,"optimizer":my_optimizer,"month":month,"consecutive": flag}, ignore_index=True)
    return pd.DataFrame(results)

In [8]:
filepath_out='./../data/cnnlstm_results_1.csv'
results=test_model(filepath_out, feature_conditions)

Testing model: lag: [10, 15] , neurons: 32 , batch_size: 80
train_X Shape: (3444, 10, 2)
train_y Shape: (3444, 2)
test_X Shape: (862, 10, 2)
test_y Shape: (862,)
Epoch 1/10000


2023-08-28 21:52:43.995867: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


35/35 - 2s - loss: 0.7320 - accuracy: 0.5136 - val_loss: 0.6931 - val_accuracy: 0.5109 - 2s/epoch - 52ms/step
Epoch 2/10000
35/35 - 0s - loss: 0.7000 - accuracy: 0.4966 - val_loss: 0.6929 - val_accuracy: 0.5109 - 383ms/epoch - 11ms/step
Epoch 3/10000
35/35 - 0s - loss: 0.7049 - accuracy: 0.4998 - val_loss: 0.6930 - val_accuracy: 0.5109 - 420ms/epoch - 12ms/step
Epoch 4/10000
35/35 - 0s - loss: 0.7006 - accuracy: 0.4940 - val_loss: 0.6930 - val_accuracy: 0.5109 - 423ms/epoch - 12ms/step
Epoch 5/10000
35/35 - 1s - loss: 0.7010 - accuracy: 0.4940 - val_loss: 0.6929 - val_accuracy: 0.5109 - 504ms/epoch - 14ms/step
Epoch 6/10000
35/35 - 0s - loss: 0.6968 - accuracy: 0.5016 - val_loss: 0.6934 - val_accuracy: 0.4891 - 395ms/epoch - 11ms/step
Epoch 7/10000
35/35 - 0s - loss: 0.6968 - accuracy: 0.5056 - val_loss: 0.6939 - val_accuracy: 0.4891 - 411ms/epoch - 12ms/step
Epoch 8/10000
35/35 - 0s - loss: 0.6957 - accuracy: 0.5201 - val_loss: 0.6942 - val_accuracy: 0.4891 - 379ms/epoch - 11ms/step
E

  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n,  "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,"optimizer":my_optimizer,"month":month,"consecutive": flag}, ignore_index=True)


Epoch 1/10000
35/35 - 2s - loss: 0.7262 - accuracy: 0.5102 - val_loss: 0.6931 - val_accuracy: 0.5044 - 2s/epoch - 49ms/step
Epoch 2/10000
35/35 - 0s - loss: 0.7031 - accuracy: 0.5145 - val_loss: 0.6934 - val_accuracy: 0.4956 - 385ms/epoch - 11ms/step
Epoch 3/10000
35/35 - 0s - loss: 0.6994 - accuracy: 0.4982 - val_loss: 0.6935 - val_accuracy: 0.4956 - 391ms/epoch - 11ms/step
Epoch 4/10000
35/35 - 0s - loss: 0.7012 - accuracy: 0.5011 - val_loss: 0.6941 - val_accuracy: 0.4956 - 410ms/epoch - 12ms/step
Epoch 5/10000
35/35 - 0s - loss: 0.6998 - accuracy: 0.5015 - val_loss: 0.6931 - val_accuracy: 0.5044 - 405ms/epoch - 12ms/step
Epoch 6/10000
35/35 - 0s - loss: 0.7015 - accuracy: 0.5004 - val_loss: 0.6932 - val_accuracy: 0.4985 - 401ms/epoch - 11ms/step
Epoch 7/10000
35/35 - 0s - loss: 0.6968 - accuracy: 0.5047 - val_loss: 0.6947 - val_accuracy: 0.4956 - 405ms/epoch - 12ms/step
Epoch 8/10000
35/35 - 0s - loss: 0.6935 - accuracy: 0.5269 - val_loss: 0.6939 - val_accuracy: 0.4956 - 414ms/epoch

  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n,  "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,"optimizer":my_optimizer,"month":month,"consecutive": flag}, ignore_index=True)


In [9]:
results.to_csv(filepath_out, index=False)