In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.layers import Dropout
from keras.engine.sequential import Sequential
from keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam, SGD

In [None]:
# Location of the 30-minute dataset, relative to the notebook's working directory.
data_path = 'data/30MIN_Full_Dataset.csv'

# Read the CSV, discard the `timestamp` column and skip the first five rows.
# NOTE(review): the reason for skipping five rows is not visible here
# (presumably indicator warm-up rows) - confirm.
raw_frame = pd.read_csv(data_path)
data = raw_frame.drop(columns=['timestamp'])[5:]

data.head()

In [None]:
# Feature-set selection: exactly one `data_features` assignment is active.
# The alternative experiment configurations are kept as comments.

# Price Data:
# data_features = data.drop(['price_close', 'MACD', 'SMA', 'OBV', 'RSI', 'MFI', 'vader_pos', 'vader_neg', 'vader_neu', 'vader_compound', 'pos_count', 'neg_count', 'neu_count'], axis=1)

# Price Data + Technical Indicator:
# data_features = data.drop(['price_close', 'vader_pos', 'vader_neg', 'vader_neu', 'vader_compound', 'pos_count', 'neg_count', 'neu_count'], axis=1)

# Price Data + Sentiment Indicator:
# data_features = data.drop(['price_close', 'MACD', 'SMA', 'OBV', 'RSI', 'MFI', 'pos_count', 'neg_count', 'neu_count'], axis=1)

# Price Data + Technical Indicator + Sentiment Indicator:
# data_features = data.drop(['price_close', 'pos_count', 'neg_count', 'neu_count'], axis=1)

# Price Data + Technical Indicator in 30 MIN data:
# data_features = data.drop(['price_close', 'vader_pos', 'vader_neg', 'vader_neu', 'vader_compound', 'pos_count', 'neg_count', 'neu_count', 'pos_keywords_occur', 'neg_keywords_occur'], axis=1)

# Price Data + Selected Features (active configuration)
excluded_columns = ['price_close', 'MFI', 'vader_compound', 'pos_count', 'neg_count', 'neu_count']
data_features = data.drop(columns=excluded_columns)

# The regression target is the closing price.
data_labels = data['price_close']

# Separate scalers so predictions can be mapped back to prices with
# `scaler_label.inverse_transform` alone.
# NOTE(review): both scalers are fitted on every row, i.e. before the
# train/test split performed later in the notebook, so the min/max of the
# test period influence the scaling of the training data.
scaler_feature = MinMaxScaler(feature_range=(0, 1))
scaler_label = MinMaxScaler(feature_range=(0, 1))

feature_matrix = data_features.to_numpy()
label_column = data_labels.to_numpy().reshape(-1, 1)  # 2-D column, as the scaler requires

feature = scaler_feature.fit_transform(feature_matrix)
label = scaler_label.fit_transform(label_column)

In [None]:
def split_data(f, l, delay):
    """Cut two aligned sequences into sliding windows.

    Args:
        f: Indexable sequence of per-step feature rows.
        l: Indexable sequence of per-step labels, aligned with ``f``.
        delay: Number of consecutive steps in each window.

    Returns:
        A pair ``(windows, targets)`` of NumPy arrays. ``windows[k]`` is
        ``f[k:k + delay]`` and ``targets[k]`` is ``l[k + delay - 1]``, i.e. the
        label at the last step of that window. ``len(f) - delay`` windows are
        produced, and ``targets`` has its length-1 axes squeezed away.
    """
    starts = range(len(f) - delay)
    windows = [f[start:start + delay] for start in starts]
    targets = [l[start + delay - 1] for start in starts]
    return np.array(windows), np.array(targets).squeeze()

# Number of consecutive time steps in each model input sample.
window_size = 5

# Store the windowed arrays under their own names instead of overwriting
# `feature` / `label`. The cell is then idempotent: re-running it always
# windows the scaled 2-D arrays, never data that was already windowed.
feature_windows, label_windows = split_data(feature, label, window_size)

# Order-preserving 80/20 split: the first 80% of the windows form the training
# set and the remainder the test set (no shuffling).
train_size = int(0.8 * len(label_windows))

X_train, X_test = feature_windows[:train_size], feature_windows[train_size:]
y_train, y_test = label_windows[:train_size], label_windows[train_size:]

print(X_train.shape, X_test.shape)

In [None]:
# Partition the training windows into five consecutive folds.
# `newarr` is kept as a list because later cells iterate over the folds.
newarr = np.array_split(X_train, 5)
X_train_1, X_train_2, X_train_3, X_train_4, X_train_5 = newarr

# Partition the training labels the same way, so fold k of y pairs with fold k of X.
newarr_y = np.array_split(y_train, 5)
y_train_1, y_train_2, y_train_3, y_train_4, y_train_5 = newarr_y

In [None]:
# Leave-one-fold-out training sets: each dataset joins four of the five folds.
#
# The folds are NumPy arrays, so `a + b` would add them element-wise (or raise
# a broadcasting error when np.array_split produced folds of unequal length).
# They must be joined along the sample axis with np.concatenate instead.
# Features and labels are built together so they always stay aligned.

# Train dataset 1 (folds 1-4)
X_train_dataset_1 = np.concatenate([X_train_1, X_train_2, X_train_3, X_train_4], axis=0)

# Train dataset 2 (folds 1-3, 5)
X_train_dataset_2 = np.concatenate([X_train_1, X_train_2, X_train_3, X_train_5], axis=0)

# Train dataset 3 (folds 1, 2, 4, 5)
X_train_dataset_3 = np.concatenate([X_train_1, X_train_2, X_train_4, X_train_5], axis=0)

# Train dataset 4 (folds 1, 3-5)
X_train_dataset_4 = np.concatenate([X_train_1, X_train_3, X_train_4, X_train_5], axis=0)

# Train dataset 5 (folds 2-5)
X_train_dataset_5 = np.concatenate([X_train_2, X_train_3, X_train_4, X_train_5], axis=0)

# Matching training labels (these are training targets, not test data).

# Train labels 1 (folds 1-4)
y_train_dataset_1 = np.concatenate([y_train_1, y_train_2, y_train_3, y_train_4], axis=0)

# Train labels 2 (folds 1-3, 5)
y_train_dataset_2 = np.concatenate([y_train_1, y_train_2, y_train_3, y_train_5], axis=0)

# Train labels 3 (folds 1, 2, 4, 5)
y_train_dataset_3 = np.concatenate([y_train_1, y_train_2, y_train_4, y_train_5], axis=0)

# Train labels 4 (folds 1, 3-5)
y_train_dataset_4 = np.concatenate([y_train_1, y_train_3, y_train_4, y_train_5], axis=0)

# Train labels 5 (folds 2-5)
y_train_dataset_5 = np.concatenate([y_train_2, y_train_3, y_train_4, y_train_5], axis=0)

In [None]:
# Training sets in the order the base models consume them.
# Dataset i leaves out fold 6 - i (dataset 1 omits fold 5, ..., dataset 5 omits fold 1).
X_train_datasets = [
    X_train_dataset_1,
    X_train_dataset_2,
    X_train_dataset_3,
    X_train_dataset_4,
    X_train_dataset_5,
]
y_train_datasets = [
    y_train_dataset_1,
    y_train_dataset_2,
    y_train_dataset_3,
    y_train_dataset_4,
    y_train_dataset_5,
]

# Base Models

In [None]:
# Number of base models of each kind. The training cells zip the models with
# the five training datasets, so at most five of each are ever trained.
nGRU = nLSTM = 5

In [None]:
# One independent GRU regressor per training dataset:
# GRU(2048) -> Dense(1024) -> Dense(1), all dense layers linear.
GRUs = [
    keras.Sequential([
        layers.GRU(2048),
        layers.Dense(1024),
        layers.Dense(1)
    ]) for _ in range(nGRU)
]


# One independent LSTM regressor per training dataset:
# LSTM(512) -> Dense(512) -> Dense(1).
# The recurrent layer used to be `layers.SimpleRNN`, although this list, its
# size constant (`nLSTM`) and the plot labels all describe the models as
# LSTMs. An actual LSTM layer is used so the code matches what is reported.
LSTMs = [
    keras.Sequential([
        layers.LSTM(512),
        layers.Dense(512),
        layers.Dense(1)
    ]) for _ in range(nLSTM)
]

In [None]:
# Train GRU k on training dataset k (every fold except the one it will later
# predict). The loop variable is not called `model`, so it cannot be confused
# with the meta-model defined further down.
gru_histories = []
for gru_model, X, y in zip(GRUs, X_train_datasets, y_train_datasets):
    # `lr=` and `decay=` are legacy Adam arguments that newer Keras optimizers
    # reject. The same schedule, lr / (1 + decay * step), is expressed with
    # InverseTimeDecay (decay_steps=1) and passed as `learning_rate`.
    lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.001,
        decay_steps=1,
        decay_rate=0.00005,
    )
    gru_model.compile(loss='mean_squared_error',
                      optimizer=keras.optimizers.Adam(learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-08),
                      metrics=[keras.metrics.RootMeanSquaredError()]
                      )
    history = gru_model.fit(X, y,
                            epochs=30,
                            batch_size=4,
                            shuffle=True
                            )
    # Keep every training history instead of only the last one.
    gru_histories.append(history)


In [None]:
# Train the k-th model in `LSTMs` on training dataset k, with a fresh Adam
# optimizer and RMSE metric per model. Batches are taken in order (no shuffling).
for base_model, X, y in zip(LSTMs, X_train_datasets, y_train_datasets):
    optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError()
    base_model.compile(loss='mse', optimizer=optimizer, metrics=[rmse_metric])

    fit_options = dict(batch_size=16, epochs=50, shuffle=False)
    history = base_model.fit(X, y, **fit_options)

In [None]:
# Out-of-fold predictions for the meta-model.
# Model k was trained without fold 5 - k (0-based k), so the models are paired
# with the folds in reverse order and each one predicts only its unseen fold.
# The loop variable is not called `data`, which would overwrite the DataFrame
# loaded at the top of the notebook and break re-runs of the earlier cells.
oof_gru, oof_lstm = [], []
for gru, lstm, held_out_fold in zip(GRUs, LSTMs, newarr[::-1]):
    oof_gru.append(gru.predict(held_out_fold))
    oof_lstm.append(lstm.predict(held_out_fold))

# Predictions were collected from the last fold to the first; reverse them so
# the rows line up with `y_train`, then join once instead of repeatedly
# prepending Python lists.
pred_GRU = np.concatenate(oof_gru[::-1], axis=0)
pred_LSTM = np.concatenate(oof_lstm[::-1], axis=0)

# Meta-model input: one column of GRU predictions, one of LSTM predictions.
X_tmp = np.concatenate((pred_GRU, pred_LSTM), axis=1)

In [None]:
# Error of the out-of-fold base predictions on the training set, in price units.
true_prices = scaler_label.inverse_transform(y_train.reshape(-1,1))

# Same report for the GRU predictions first, then the LSTM predictions.
for base_pred in (pred_GRU, pred_LSTM):
    # Average over the prediction axis, then map back to the price scale.
    pred_prices = scaler_label.inverse_transform(np.mean(np.array(base_pred), axis=1).reshape(-1, 1))

    mse = mean_squared_error(true_prices, pred_prices)
    # `mean_squared_error(..., squared=False)` is deprecated and removed in
    # recent scikit-learn releases; the square root of the MSE is the same value.
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(true_prices, pred_prices)

    print(mse, rmse, mae)

In [None]:
# Out-of-fold base-model predictions against the scaled training targets.
fig, ax = plt.subplots(figsize=(26, 10))

ax.plot(pred_GRU, label='GRU')
ax.plot(pred_LSTM, label='LSTM')
ax.plot(y_train, label='Ground Truth')

# Label the axes so the figure can be read on its own.
ax.set_xlabel('Training sample index')
ax.set_ylabel('price_close (min-max scaled)')
ax.legend()
ax.set_title('Predictions without Sentiment Analysis')

# The unused `props` dict was removed: no text box is drawn on this figure.
plt.show()

# MLP

In [None]:
# Meta-model: a single Dense(1) unit, i.e. a learned linear combination of the
# base-model prediction columns in `X_tmp`.
batch_size = 8
model = Sequential([Dense(1)])

# 'accuracy' is a classification metric and carries no information for a
# continuous target; track RMSE instead, as the base models do.
model.compile(optimizer= Adam(learning_rate=5e-4), loss='mse', metrics=[keras.metrics.RootMeanSquaredError()])

# Keep the History object rather than echoing its repr as the cell output.
meta_history = model.fit(X_tmp, y_train, batch_size = batch_size, epochs=50, verbose = 1)

In [None]:
# Test-set predictions of every base model. The comprehension variable is
# `base_model` so the meta-model's name `model` is not reused here.
gru_test_preds = [base_model.predict(X_test) for base_model in GRUs]
lstm_test_preds = [base_model.predict(X_test) for base_model in LSTMs]

# Average across the models of each kind (axis 0 is the model axis).
pred_GRU = np.mean(np.array(gru_test_preds), axis=0)
pred_LSTM = np.mean(np.array(lstm_test_preds), axis=0)

# Two-column meta-model input. The former `.squeeze()` was dropped: it did
# nothing for more than one test sample and collapsed a single-sample input
# of shape (1, 2) to (2,), which the meta-model would misread.
X_tmp2 = np.concatenate((pred_GRU, pred_LSTM), axis=1)

prediction = model.predict(X_tmp2)

In [None]:
# Map the meta-model output and the test targets back to price units.
pred_prices = scaler_label.inverse_transform(prediction)
true_prices = scaler_label.inverse_transform(y_test.reshape(-1,1))

mse = mean_squared_error(true_prices, pred_prices)
# `mean_squared_error(..., squared=False)` is deprecated and removed in recent
# scikit-learn releases; the square root of the MSE is the same value.
rmse = float(np.sqrt(mse))
mae = mean_absolute_error(true_prices, pred_prices)

# Mathtext summary shown in a box on the figure.
metrics = '\n'.join((
    '$MSE=%f$' % (mse),
    '$RMSE=%f$' % (rmse),
    '$MAE=%f$' % (mae)
))

fig, ax = plt.subplots(figsize=(26, 10))
ax.plot(pred_prices, label='Prediction Price')
ax.plot(true_prices, label='Ground Truth')
ax.set_xlabel('Test sample index')
ax.set_ylabel('price_close')
ax.legend()
ax.set_title('Predictions for Meta-Model (1 day)')

props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
# Place the box in axes coordinates (top-left corner). The previous data
# coordinate y=30000 was only inside the plot when prices were near that level.
ax.text(0.01, 0.98, metrics, transform=ax.transAxes, fontsize=14,
        verticalalignment='top', bbox=props)

# plt.savefig('no_senti.png', dpi=500)
plt.show()