# LSTM Model for price prediction

In [143]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from sklearn.preprocessing import StandardScaler
import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import LSTM
import math
from keras.optimizers import Adam

from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import classification_report

In [144]:
def rename_datetime(data):
    data.rename(columns={'Unnamed: 0': 'datetime'},inplace=True)
    data['datetime']=pd.to_datetime(data['datetime'])
    data.set_index('datetime', inplace=True)
    return data
# feature selection
# create features
def create_features(data, g_lag, tv_lag, tw_lag,v_lag):
    data['return'] =data['Close'].pct_change().dropna()
    data['google_trends_lag']=data['google_trends'].shift(g_lag)
    data['tweet_volume_lag']=data['tweet_volume'].shift(tv_lag)
    data['tw_polarity_lag'] = data['tw_polarity'].shift(tw_lag)
    data['volume_lag'] = data['Volume'].shift(v_lag)

    # data.drop(columns=['Open','High','Low'],inplace=True)
    # Add features like RSI? Moving average?

    data.dropna(inplace=True)
    return data

# keep the wanted features
def keep_features(feature_conditions):
    features=['return']
    for feature, condition in feature_conditions.items():
        if condition:
            features.append(feature)
    return features


def timestep_matrix(dataset,label, timestep):
    X_data, y_data=[], []
    for i in range(len(dataset)-timestep-1):
        feature=dataset[i:(i+timestep)]
        X_data.append(feature)
        y_data.append(label[i+timestep])
    y_data=np.array(y_data).reshape(len(y_data),1)
    return np.array(X_data), y_data

# normalize data using Minmaxscaler
def normalize_reshape_data(train, test,val,timestep):
    train_y=train['return'].to_numpy().reshape(-1,1)
    test_y=test['return'].to_numpy().reshape(-1,1)
    val_y=val['return'].to_numpy().reshape(-1,1)

    feature_scaler=MinMaxScaler()
    scale_train_data=feature_scaler.fit_transform(train)
    scale_test_data= feature_scaler.transform(test)
    scale_val_data=feature_scaler.transform(val)

    Label_scaler = MinMaxScaler()
    scaled_train_y=Label_scaler.fit_transform(train_y)
    scaled_test_y=Label_scaler.transform(test_y)
    scaler_val_y=Label_scaler.transform(val_y)

    train_X, train_y = timestep_matrix(scale_train_data,scaled_train_y, timestep)
    test_X, test_y = timestep_matrix(scale_test_data,scaled_test_y, timestep)
    val_X, val_y=timestep_matrix(scale_val_data,scaler_val_y, timestep)

    return train_X, test_X, train_y, test_y,val_X, val_y,Label_scaler

In [145]:
filepath='./../data/processed_data.csv'

In [146]:
def get_data(filepath, g_lag, tv_lag, tw_lag,v_lag, timestep, shuffle_times, split_ratio, feature_conditions, month, flag):
    # import data
    # import the original data. processed_data
    # processed_data: weighted reddit score+ fill the nan data
    data=pd.read_csv(filepath)
    # show the data
    data=rename_datetime(data)

    # get the subdata set
    if flag==0:
        data=data[data.index.month==month]
    else:
        data=data[(data.index.month>=1) & (data.index.month <=month)]

    # create features
    data_created = create_features(data,g_lag,tv_lag,tw_lag,v_lag)

    # keep the wanted features
    features = keep_features(feature_conditions)
    data=data_created[features]

    # split the data
    train_val_size=int(len(data)*split_ratio)
    train_size=int(train_val_size*split_ratio)
    val_size=train_val_size-train_size
    test_size=len(data)-train_val_size

    train ,val, test=data[0:train_size],data[train_size:train_val_size], data[train_val_size:len(data)]

    # normalized the data using MinMaxscaler
    n_features=len(features)

    train_X, test_X, train_y, test_y,val_X, val_y,Label_scaler =normalize_reshape_data(train, test, val, timestep)

    print("train_X Shape:", train_X.shape)
    print("train_y Shape:", train_y.shape)
    print("test_X Shape:", test_X.shape)
    print("test_y Shape:", test_y.shape)

    return data, train_X, test_X, train_y, test_y,val_X, val_y, n_features,Label_scaler

In [147]:
# # get correlation matrix
# sns.heatmap(data.corr(), annot=True)
# plt.show()

## Model Building

In [148]:
def create_model(neurons, epochs, dropout, batch_size, verbose, layers,
                 activ_func, activ_dense,my_optimizer,
                 train_X, train_y, test_X, test_y,val_X, val_y, n_features, timestep,Label_scaler ):
    #set seed to reproduce results
    np.random.seed(1)
    tf.random.set_seed(1)

    # design network
    model = Sequential()

    #return sequences flag if there are more than 1 layer
    return_seq = layers > 1

    #add first layer
    model.add(LSTM(neurons, return_sequences=return_seq, input_shape=(timestep, n_features), activation=activ_func))
    # model.add(Dropout(dropout))

    #add the other layers
    for i in range(1, layers):
        ret_seq = i != (layers-1)
        model.add(LSTM(neurons, return_sequences=ret_seq, activation=activ_func))
        model.add(Dropout(dropout))

    #add a dense layer to output the prediction
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer=my_optimizer)

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience =50)

    # fit network
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose,validation_data=(val_X, val_y),callbacks=[callback])

    #make prediction
    pred = model.predict(test_X)

    # Transform back to original form
    y_pred=Label_scaler.inverse_transform(pred)

    y_true=Label_scaler.inverse_transform(test_y.reshape(-1,1))

    RMSE=math.sqrt(mean_squared_error(y_true,y_pred))
    MAPE=np.mean(np.abs(y_true-y_pred)/np.abs(y_true))
    R2_score =r2_score(y_true, y_pred)

    return RMSE, MAPE, R2_score

In [149]:
# model parameters

feature_conditions = {
        'google_trends': 0, 'google_trends_lag': 0,
        'tweet_volume_lag': 1, 'tw_polarity_lag': 0, 'tw_compound': 0,
        'tw_polarity': 0, 'tweet_volume': 0,'re_compound': 0,'re_polarity': 0,
        're_subjectivity': 0, 'volume_lag':0
    }
def test_model(filepath_out, feature_conditions):
    columns = ["timestep","features","google_trends_lag","tweet_volume_lag","tweet_polarity_score_lag", "batch_size", "neurons", "layers", "mean_mape","mean_r2", "mean_rmse","min_rmse", "max_rmse", "diff_rmse","optimizer","month","consecutive","actic_func"]

    try:
        results = pd.read_csv(filepath_out)
    except:
        results = pd.DataFrame(columns=columns)

    #lagged_features
    timestep = [15]
    #train_ratio
    split_ratio =0.8
    shuffle_times = 3
    activ_func = "relu"
    activ_dense = 'sigmoid'
    my_optimizer = 'adam'
    # my_optimizer=RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)
    epochs = 10000
    months =[6]
     #for each lag feature
    for month in tqdm(months,total=len(months)):
        for step in tqdm(timestep,total=len(timestep)):
            neurons = [32]
            layers = [1]
            batch_sizes = [80]
            dropout = 0.25
            verbose=0
            # Lags
            g_lag = [2]
            tv_lag = 31 # tweets volume
            tw_lag = 15 # tweets score
            v_lag= [3]

            # set flag: 1 - consecutive months, flag: 0 - single month
            flag=1

            #for each epoch, neuron, layers and batch_size value
            for n in neurons:
                for l in layers:
                    for b in  batch_sizes:
                        print("Testing model: lag:", step, ", neurons:", n, ", layers:", l, ", batch_size:", b)
                        for g in g_lag:
                            for v in v_lag:
                                #run for 5 times
                                rmse = []
                                mape =[]
                                r2 = []
                                for i in tqdm(range (0,5)):
                                    data, train_X, test_X, train_y, test_y,val_X, val_y, n_features,Label_scaler = get_data(filepath, g, tv_lag, tw_lag,v, step, shuffle_times, split_ratio, feature_conditions,month,flag)
                                    RMSE, MAPE, R2 = create_model(n, epochs, dropout, b, verbose, l, activ_func, activ_dense,my_optimizer, train_X, train_y, test_X, test_y, val_X, val_y, n_features, step,Label_scaler)
                                    rmse.append(RMSE)
                                    mape.append(MAPE)
                                    r2.append(R2)

                                #calculate mean values
                                rmse = np.array(rmse)
                                mean_rmse =rmse.mean()
                                min_rmse =rmse.min()
                                max_rmse =rmse.max()
                                diff_rmse = max_rmse - min_rmse
                                mean_mape= np.array(mape).mean()
                                mean_r2=np.array(r2).mean()

                                results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv_lag,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n, "layers":l, "mean_mape":mean_mape,"mean_r2":mean_r2, "mean_rmse": mean_rmse,"min_rmse":min_rmse, "max_rmse":max_rmse, "diff_rmse": diff_rmse,"optimizer":my_optimizer,"month":month,"consecutive": flag,"actic_func":activ_func}, ignore_index=True)
    return pd.DataFrame(results)

In [150]:
filepath_out='./../data/lstm_returns_regression.csv'
results=test_model(filepath_out, feature_conditions)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

Testing model: lag: 15 , neurons: 32 , layers: 1 , batch_size: 80




  0%|          | 0/5 [00:00<?, ?it/s][A[A

train_X Shape: (2744, 15, 2)
train_y Shape: (2744, 1)
test_X Shape: (847, 15, 2)
test_y Shape: (847, 1)




 20%|██        | 1/5 [00:25<01:43, 25.96s/it][A[A

train_X Shape: (2744, 15, 2)
train_y Shape: (2744, 1)
test_X Shape: (847, 15, 2)
test_y Shape: (847, 1)




 40%|████      | 2/5 [00:55<01:24, 28.13s/it][A[A

train_X Shape: (2744, 15, 2)
train_y Shape: (2744, 1)
test_X Shape: (847, 15, 2)
test_y Shape: (847, 1)




 60%|██████    | 3/5 [01:21<00:53, 26.92s/it][A[A

train_X Shape: (2744, 15, 2)
train_y Shape: (2744, 1)
test_X Shape: (847, 15, 2)
test_y Shape: (847, 1)




 80%|████████  | 4/5 [01:37<00:22, 22.75s/it][A[A

train_X Shape: (2744, 15, 2)
train_y Shape: (2744, 1)
test_X Shape: (847, 15, 2)
test_y Shape: (847, 1)




100%|██████████| 5/5 [01:58<00:00, 23.79s/it][A[A
  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv_lag,"tweet_polarity_score_lag": tw_lag, "batch_size":b, "neurons":n, "layers":l, "mean_mape":mean_mape,"mean_r2":mean_r2, "mean_rmse": mean_rmse,"min_rmse":min_rmse, "max_rmse":max_rmse, "diff_rmse": diff_rmse,"optimizer":my_optimizer,"month":month,"consecutive": flag,"actic_func":activ_func}, ignore_index=True)

100%|██████████| 1/1 [01:58<00:00, 118.98s/it][A
100%|██████████| 1/1 [01:58<00:00, 118.98s/it]


In [151]:
results.to_csv(filepath_out, index=False)