# CNN Model for price movement

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import tensorflow as tf

import keras.utils
from keras.models import Sequential
from keras.layers import Convolution1D, Conv1D, ZeroPadding1D, MaxPooling1D, BatchNormalization, Activation, Dropout, Flatten, Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam

from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import classification_report

In [19]:
def rename_datetime(data):
    data.rename(columns={'Unnamed: 0': 'datetime'},inplace=True)
    data['datetime']=pd.to_datetime(data['datetime'])
    data.set_index('datetime', inplace=True)
    return data
# feature selection
# create features
def create_features(data, g_lag, tv_lag, tw_lag):
    data['Change'] =data['Close'].diff().dropna()
    data['Label'] = np.where(data['Change']>0, 1 ,0)
    data['google_trends_lag']=data['google_trends'].shift(g_lag)
    data['tweet_volume_lag']=data['tweet_volume'].shift(tv_lag)
    data['tw_polarity_lag'] = data['tw_polarity'].shift(tw_lag)

    data.drop(columns=['Open','High','Low','Change'],inplace=True)
    # Add features like RSI? Moving average?

    data.dropna(inplace=True)
    return data
# keep the wanted features
def keep_features(feature_conditions):
    features=['Label','Close']
    for feature, condition in feature_conditions.items():
        if condition:
            features.append(feature)
    return features

# create the lagged features based on the timesteps
def reshape_features(data, to_keep=1, to_remove=1):
    variables = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    columns, names = list(), list()

    for i in range(to_keep, 0, -1):
        columns.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(variables)]

    for i in range(0, to_remove):
        columns.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(variables)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(variables)]

    #put it all together
    final = pd.concat(columns, axis=1)
    final.columns = names

    #drop rows with NaN values
    final.dropna(inplace=True)

    new_data = final.reset_index()

    new_data = new_data.drop(columns=['datetime'])

    return new_data

# shuffle the data
def shuffle_data(times, data):
    np.random.seed(1)
    for i in range(times+1):
        data=shuffle(data)
    return data

# split labels from data
def split_label(train, test):
    train_y = train['var1(t)'].values
    test_y = test['var1(t)'].values
    train_y = train_y.reshape(len(train_y), 1)
    test_y = test_y.reshape(len(test_y), 1)
    return train_y, test_y

# normalize data using Minmaxscaler
def normalize_reshape_data(train, test, train_y, test_y, all_features, n_features, timestep):
    feature_scaler=MinMaxScaler()
    scale_train_data=feature_scaler.fit_transform(train)
    scale_test_data= feature_scaler.transform(test)
    train = scale_train_data[:, :all_features]
    test = scale_test_data[:, :all_features]
    #keep only prices array
    train_X, train_y = train[:, :all_features], train_y[:, -1]
    test_X, test_y = test[:, :all_features], test_y[:, -1]
    # reshape the data
    train_X = train_X.reshape((train_X.shape[0], timestep, n_features))
    test_X = test_X.reshape((test_X.shape[0], timestep, n_features))
    #set labels for training data to categorical
    train_y = to_categorical(train_y, 2)

    return train_X, test_X, train_y, test_y

In [20]:
filepath='./../data/processed_data.csv'

In [21]:
def get_data(filepath, g_lag, tv_lag, tw_lag, timestep, shuffle_times, split_ratio, feature_conditions):
    # import data
    # import the original data. processed_data
    # processed_data: weighted reddit score+ fill the nan data
    data=pd.read_csv(filepath)
    # show the data
    data=rename_datetime(data)

    # create features
    data_created = create_features(data,g_lag,tv_lag,tw_lag)

    # keep the wanted features
    features = keep_features(feature_conditions)
    data=data_created[features]

    # reshape the data
    # create the lagged features based on the timesteps
    df_copy = data.copy()
    new_data=reshape_features(df_copy, timestep, 1)

    # shuffle the data
    shuffled_data=shuffle_data(shuffle_times, new_data)

    # split the data
    train, test= train_test_split(shuffled_data, test_size=split_ratio)
    train_y, test_y=split_label(train, test)

    # normalized the data using MinMaxscaler
    n_features=len(features)
    all_features = timestep * n_features
    if (all_features==0):
        all_features=n_features
    train_X, test_X, train_y, test_y =normalize_reshape_data(train, test, train_y, test_y, all_features, n_features,timestep)

    print("train_X Shape:", train_X.shape)
    print("train_y Shape:", train_y.shape)
    print("test_X Shape:", test_X.shape)
    print("test_y Shape:", test_y.shape)

    return data, train_X, test_X, train_y, test_y, n_features

In [22]:
# # get correlation matrix
# sns.heatmap(data.corr(), annot=True)
# plt.show()

## Model Building

In [23]:
def create_model(neurons, epochs, dropout, batch_size, verbose, layers,
                 activ_func, activ_dense,my_optimizer,
                 train_X, train_y, test_X, test_y, n_features, timestep ):
    #set seed to reproduce results
    np.random.seed(1)
    tf.random.set_seed(1)

    # design network
    model = Sequential()

    #return sequences flag if there are more than 1 layer
    return_seq = layers > 1

    #add first layer
    model.add(Conv1D(neurons, kernel_size=2, padding='same', input_shape=(timestep, n_features), activation=activ_func))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Dropout(dropout))

    #add the other layers
    for i in range(1, layers):
        model.add(Conv1D(neurons, kernel_size=2, padding='same', activation=activ_func))
        model.add(MaxPooling1D(pool_size=2, padding='same'))
        model.add(Dropout(dropout))

    #flatten and add a dense layer and to output the prediction
    model.add(Flatten())
    model.add(Dense(2, activation=activ_dense))
    model.compile(loss='categorical_crossentropy', optimizer=my_optimizer, metrics=['accuracy'])

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience =50)

    # fit network
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose, shuffle=False,validation_split=0.2, callbacks=[callback])

    #reshape
    test_X = test_X.reshape((test_X.shape[0], timestep, n_features))

    #make prediction
    pred = model.predict(test_X)

    #reshape again
    test_X = test_X.reshape((test_X.shape[0], timestep* n_features,))

    #get prediction
    y_pred = np.argmax(pred, axis=1)

    # calculate the metrics
    report=classification_report(
          test_y,
          y_pred,target_names = ["Down", "Up"],
          digits = 5, output_dict=True)

    # precision = report['Down']['precision']
    down_f1_score = report['Down']['f1-score']
    up_f1_score = report['Up']['f1-score']
    accuracy=report['accuracy']

    return down_f1_score, up_f1_score, accuracy

In [24]:
# model parameters

feature_conditions = {
        'google_trends': 0, 'google_trends_lag': 0,
        'tweet_volume_lag': 0, 'tw_polarity_lag': 0, 'tw_compound': 0,
        'tw_polarity': 0, 'tweet_volume': 0,'re_compound': 0,'re_polarity': 0,
        're_subjectivity': 0
    }
def test_model(filepath_out, feature_conditions):
    columns = ["timestep","features","google_trends_lag","tweet_volume_lag","tweet_polarity_score_lag", "batch_size", "neurons", "layers", "mean_down_f1_score","mean_up_f1_score", "mean_acc","min_acc", "max_acc", "diff_acc"]

    try:
        results = pd.read_csv(filepath_out)
    except:
        results = pd.DataFrame(columns=columns)

    #lagged_features
    timestep = [10,15]
    #train_ratio
    split_ratio =0.2
    shuffle_times = 3
    activ_func = "linear"
    activ_dense = 'softmax'
    my_optimizer = 'adam'
    # my_optimizer=RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)
    epochs = 10000
     #for each lag feature
    for step in timestep:
        neurons = [32]
        layers = [1]
        batch_sizes = [200]
        dropout = 0.25
        verbose=2
        # Lags
        g_lag = [10]
        tv_lag = [28] # tweets volume
        tw_lag = 15 # tweets score

        #for each epoch, neuron, layers and batch_size value
        for n in neurons:
            for l in layers:
                for b in  batch_sizes:
                    print("Testing model: lag:", timestep, ", neurons:", n, ", layers:", l, ", batch_size:", b)
                    for g in g_lag:
                        for tv in tv_lag:
                            #run for 5 times
                            accuracies = []
                            down_f1_score =[]
                            up_f1_score = []
                            for i in range (0,10):
                                data, train_X, test_X, train_y, test_y, n_features = get_data(filepath, g, tv, tw_lag, step, shuffle_times, split_ratio, feature_conditions)
                                down_score, up_score, accuracy = create_model(n, epochs, dropout, b, verbose, l, activ_func, activ_dense,my_optimizer, train_X, train_y, test_X, test_y, n_features, step)
                                accuracies.append(accuracy)
                                down_f1_score.append(down_score)
                                up_f1_score.append(up_score)

                            #calculate mean values
                            accuracies = np.array(accuracies)
                            mean_acc =accuracies.mean()
                            min_acc =accuracies.min()
                            max_acc =accuracies.max()
                            diff_acc = max_acc - min_acc
                            mean_down_f1_score= np.array(down_f1_score).mean()
                            mean_up_f1_score=np.array(up_f1_score).mean()

                            results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n, "layers":l, "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,'optimizer':my_optimizer}, ignore_index=True)
    return pd.DataFrame(results)

In [25]:
filepath_out='./../data/cnn_results_1.csv'
results=test_model(filepath_out, feature_conditions)

14/14 - 0s - loss: 0.6881 - accuracy: 0.5412 - val_loss: 0.6949 - val_accuracy: 0.5210 - 27ms/epoch - 2ms/step
Epoch 45/10000
14/14 - 0s - loss: 0.6866 - accuracy: 0.5481 - val_loss: 0.6950 - val_accuracy: 0.5225 - 28ms/epoch - 2ms/step
Epoch 46/10000
14/14 - 0s - loss: 0.6886 - accuracy: 0.5456 - val_loss: 0.6950 - val_accuracy: 0.5196 - 31ms/epoch - 2ms/step
Epoch 47/10000
14/14 - 0s - loss: 0.6878 - accuracy: 0.5488 - val_loss: 0.6949 - val_accuracy: 0.5210 - 28ms/epoch - 2ms/step
Epoch 48/10000
14/14 - 0s - loss: 0.6873 - accuracy: 0.5423 - val_loss: 0.6950 - val_accuracy: 0.5210 - 34ms/epoch - 2ms/step
Epoch 49/10000
14/14 - 0s - loss: 0.6873 - accuracy: 0.5354 - val_loss: 0.6950 - val_accuracy: 0.5196 - 29ms/epoch - 2ms/step
Epoch 50/10000
14/14 - 0s - loss: 0.6876 - accuracy: 0.5481 - val_loss: 0.6949 - val_accuracy: 0.5210 - 27ms/epoch - 2ms/step
Epoch 51/10000
14/14 - 0s - loss: 0.6883 - accuracy: 0.5459 - val_loss: 0.6950 - val_accuracy: 0.5152 - 26ms/epoch - 2ms/step
Epoch 5

  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n, "layers":l, "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,'optimizer':my_optimizer}, ignore_index=True)


14/14 - 0s - loss: 0.7081 - accuracy: 0.4956 - val_loss: 0.6918 - val_accuracy: 0.5189 - 206ms/epoch - 15ms/step
Epoch 2/10000
14/14 - 0s - loss: 0.7025 - accuracy: 0.5065 - val_loss: 0.6915 - val_accuracy: 0.5116 - 29ms/epoch - 2ms/step
Epoch 3/10000
14/14 - 0s - loss: 0.6989 - accuracy: 0.5087 - val_loss: 0.6913 - val_accuracy: 0.5276 - 30ms/epoch - 2ms/step
Epoch 4/10000
14/14 - 0s - loss: 0.6961 - accuracy: 0.5178 - val_loss: 0.6922 - val_accuracy: 0.5145 - 30ms/epoch - 2ms/step
Epoch 5/10000
14/14 - 0s - loss: 0.6915 - accuracy: 0.5309 - val_loss: 0.6929 - val_accuracy: 0.5131 - 28ms/epoch - 2ms/step
Epoch 6/10000
14/14 - 0s - loss: 0.6934 - accuracy: 0.5211 - val_loss: 0.6933 - val_accuracy: 0.5073 - 28ms/epoch - 2ms/step
Epoch 7/10000
14/14 - 0s - loss: 0.6927 - accuracy: 0.5338 - val_loss: 0.6937 - val_accuracy: 0.5073 - 29ms/epoch - 2ms/step
Epoch 8/10000
14/14 - 0s - loss: 0.6920 - accuracy: 0.5382 - val_loss: 0.6940 - val_accuracy: 0.5058 - 29ms/epoch - 2ms/step
Epoch 9/1000

  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n, "layers":l, "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,'optimizer':my_optimizer}, ignore_index=True)


Testing model: lag: [10, 15] , neurons: 32 , layers: 1 , batch_size: 200
train_X Shape: (3444, 10, 2)
train_y Shape: (3444, 2)
test_X Shape: (862, 10, 2)
test_y Shape: (862,)
Epoch 1/10000
14/14 - 0s - loss: 0.7217 - accuracy: 0.5016 - val_loss: 0.7068 - val_accuracy: 0.4761 - 225ms/epoch - 16ms/step
Epoch 2/10000
14/14 - 0s - loss: 0.7006 - accuracy: 0.5198 - val_loss: 0.7003 - val_accuracy: 0.4877 - 31ms/epoch - 2ms/step
Epoch 3/10000
14/14 - 0s - loss: 0.6977 - accuracy: 0.5107 - val_loss: 0.6980 - val_accuracy: 0.5007 - 28ms/epoch - 2ms/step
Epoch 4/10000
14/14 - 0s - loss: 0.6964 - accuracy: 0.5191 - val_loss: 0.6977 - val_accuracy: 0.4935 - 30ms/epoch - 2ms/step
Epoch 5/10000
14/14 - 0s - loss: 0.7030 - accuracy: 0.4947 - val_loss: 0.6966 - val_accuracy: 0.4978 - 29ms/epoch - 2ms/step
Epoch 6/10000
14/14 - 0s - loss: 0.6941 - accuracy: 0.5274 - val_loss: 0.6962 - val_accuracy: 0.4949 - 31ms/epoch - 2ms/step
Epoch 7/10000
14/14 - 0s - loss: 0.6980 - accuracy: 0.5129 - val_loss: 0.

  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n, "layers":l, "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,'optimizer':my_optimizer}, ignore_index=True)


14/14 - 0s - loss: 0.7084 - accuracy: 0.5211 - val_loss: 0.6971 - val_accuracy: 0.5131 - 243ms/epoch - 17ms/step
Epoch 2/10000
14/14 - 0s - loss: 0.7041 - accuracy: 0.5145 - val_loss: 0.6951 - val_accuracy: 0.5145 - 33ms/epoch - 2ms/step
Epoch 3/10000
14/14 - 0s - loss: 0.6989 - accuracy: 0.5105 - val_loss: 0.6933 - val_accuracy: 0.5160 - 31ms/epoch - 2ms/step
Epoch 4/10000
14/14 - 0s - loss: 0.6974 - accuracy: 0.5265 - val_loss: 0.6930 - val_accuracy: 0.5189 - 32ms/epoch - 2ms/step
Epoch 5/10000
14/14 - 0s - loss: 0.6968 - accuracy: 0.5240 - val_loss: 0.6929 - val_accuracy: 0.5174 - 31ms/epoch - 2ms/step
Epoch 6/10000
14/14 - 0s - loss: 0.6905 - accuracy: 0.5382 - val_loss: 0.6928 - val_accuracy: 0.5189 - 31ms/epoch - 2ms/step
Epoch 7/10000
14/14 - 0s - loss: 0.6921 - accuracy: 0.5247 - val_loss: 0.6928 - val_accuracy: 0.5218 - 31ms/epoch - 2ms/step
Epoch 8/10000
14/14 - 0s - loss: 0.6911 - accuracy: 0.5378 - val_loss: 0.6931 - val_accuracy: 0.5218 - 33ms/epoch - 2ms/step
Epoch 9/1000

  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n, "layers":l, "mean_down_f1_score":mean_down_f1_score,"mean_up_f1_score":mean_up_f1_score, "mean_acc": mean_acc,"min_acc":min_acc, "max_acc":max_acc, "diff_acc": diff_acc,'optimizer':my_optimizer}, ignore_index=True)


In [26]:
results.to_csv(filepath_out, index=False)