In [None]:
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # data visualization
import matplotlib.dates as mdates
import warnings
from dateutil.relativedelta import relativedelta
from functions import *
import seaborn as sns
import os.path
import statsmodels.api as sm

# Silence warnings so library chatter does not clutter the notebook output.
# The blanket filter is registered first, followed by the two explicit
# category filters in their original order.
warnings.filterwarnings(action='ignore')
for _category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings(action='ignore', category=_category)

# Master switch read by the plotting cells further down.
showGraphs = True

%matplotlib inline


In [None]:
# Load the raw Bitstamp 1-minute data set from the local data folder.
# The frame keeps the name `coinbase` because later cells refer to it.

path_to_file = "./data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv"

# Fail fast with a specific exception type (FileNotFoundError is still an
# Exception subclass, so existing handlers keep working). isfile() also
# rejects a directory that happens to have the expected name.
if not os.path.isfile(path_to_file):
    raise FileNotFoundError("File not found. Please download the file from the link below and place it in the data folder https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data")

coinbase = pd.read_csv(path_to_file)

# Only the last expression of a cell is rendered automatically, so the
# summary tables are shown explicitly; display() is provided by the IPython
# kernel. info() prints its report itself.
display(coinbase.describe())
display(coinbase.tail())
coinbase.info()


Filtrar intervalo de dados

In [None]:
# Restrict the raw frame using a 24-month relativedelta.
# NOTE(review): presumably this keeps the most recent 24 months -- confirm
# against filterByInterval in the local `functions` module.
interval = relativedelta(months=24)
filtered = filterByInterval(coinbase, interval)

# Abort early when the helper reports a problem with the timestamp column.
timestamp_values = filtered['Timestamp'].values
if hasMissingData(timestamp_values):
    raise Exception("Missing data in the dataframe")

# Report the first timestamp, the last timestamp and the number of rows kept.
summary_template = "O intervalo entre datas é de {} até {}, somando um total de {} registros."
print(summary_template.format(
    getFirstTimestamp(filtered), getLastTimestamp(filtered), filtered.shape[0]))


Prepara os dados históricos agrupados por hora

In [None]:
# Plot the 'Close' column of the filtered frame when plotting is enabled.
if showGraphs:
    # Keyword arguments forwarded verbatim to figureCloses; their exact
    # meaning is defined by that helper, which is not visible in this notebook.
    kw = {
        'annot_yaxis': 10000,
        'annot_xaxis': -1000,
        'annot_xaxis_pos': -6000,
        'annot_yaxis_pos': -8000,
        'angle': "angle,angleA=0,angleB=90",
        'angle_pos': "angle,angleA=0,angleB=-90",
    }
    figureCloses(filtered, 'Close', **kw)

In [None]:
# Disabled: additional figureCloses plots for the volume and weighted-price
# columns, drawn without annotations. Uncomment the block below to render them.
# if showGraphs:
#     kw = dict(showAnnotate=False)
#     figureCloses(filtered, 'Volume_(BTC)', **kw)
#     figureCloses(filtered, 'Volume_(Currency)', **kw)
#     figureCloses(filtered, 'Weighted_Price', **kw)


In [None]:
# Build a cleaned copy of the filtered frame: rows containing NaN are removed,
# the epoch-second 'Timestamp' column becomes a datetime, and a
# 'YYYY-MM-DD HH' string key is added for hourly grouping.
# historical['date'] = historical.Timestamp.dt.date
historical = (
    filtered
    .dropna()
    .reset_index(drop=True)
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], unit='s'))
    .assign(dateHour=lambda frame: frame['Timestamp'].dt.strftime('%Y-%m-%d %H'))
)

# One row per hour key, holding the mean of every remaining column.
historical_per_hour = historical.groupby('dateHour').mean()

Gráfico Pairplot

In [None]:


# Pairwise scatter/distribution grids over the hourly-averaged columns.
# sns.pairplot is a figure-level function that creates its own figure, so the
# former plt.figure(figsize=(30, 30)) call only produced an extra empty
# figure and has been removed. The cell now honours showGraphs like the
# other plotting cells.
if showGraphs:
    # Price against the two traded-volume columns.
    sns.pairplot(historical_per_hour, vars=['Close', 'Volume_(BTC)', 'Volume_(Currency)'])
    # Price columns against each other.
    sns.pairplot(historical_per_hour, vars=['Close', 'High', 'Low', 'Weighted_Price'])

Preparação dos Dados

In [None]:
# Model input: the 'Close' series divided by its maximum.
# NaN rows are dropped first; otherwise a single NaN makes np.max return NaN
# and the whole scaled series (and therefore the training loss) becomes NaN.
data = filtered.Close.dropna().values

# NOTE(review): the maximum is taken over the full series, i.e. including the
# part that is later held out as test data.
close_max = np.max(data)
# `max` shadows the builtin, but the plotting cells below multiply by this
# name to undo the scaling, so it is kept as an alias of close_max.
max = close_max
data = data / close_max

Separar dados de Treino e Teste

In [None]:
# Turn the scaled series into supervised samples: every input row holds
# WINDOW consecutive values and its target is the value that follows them.
WINDOW = 10  # must agree with the model's input_shape=(1, 10)

# All length-WINDOW windows as a strided view (no Python-level loop). The last
# window has no following value, so it is dropped. This yields
# len(data) - WINDOW samples; the previous range(len(data) - 11) silently
# discarded the final valid sample.
windows = np.lib.stride_tricks.sliding_window_view(data, WINDOW)
X = windows[:-1]
Y = data[WINDOW:]

# Chronological split (no shuffling): first 75 % for training, rest for test.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, shuffle=False)
X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

# Reshape to (samples, 1, WINDOW): one time step carrying WINDOW features.
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

Criação e treinamento do modelo

In [None]:
# Seed every random number generator involved in training. np.random.seed
# alone does not control the Keras backend's weight initialisation or dropout
# masks; set_random_seed seeds Python's `random`, NumPy and the backend.
from keras.utils import set_random_seed

set_random_seed(0)

# Recurrent core, wrapped below so the input is processed in both directions.
# NOTE(review): the input has a single time step with 10 features
# (input_shape=(1, 10)), so the recurrence runs over one step only.
lstm = LSTM(64, activation='relu')

model = Sequential([
    Bidirectional(lstm, input_shape=(1, 10)),
    Dropout(0.15),
    Dense(1)  # single regression output: the next scaled close value
])
model.compile(loss='mse', optimizer='adam')

# `history` keeps the per-epoch loss that the loss-curve cell plots.
history = model.fit(X_train, Y_train, epochs=20)


Avaliação do modelo nos dados de treino

In [None]:
# In-sample evaluation: predict on the training inputs and compare against
# the training targets. All figures are in scaled units (data divided by max).
pred_train = model.predict(X_train)

train_mae = mean_absolute_error(Y_train, pred_train)
train_rmse = np.sqrt(mean_squared_error(Y_train, pred_train))
train_r2 = r2_score(Y_train, pred_train)

print("Mean absolute error regression loss: %.4f" % train_mae)
print("Root mean squared error regression loss: %.4f" % train_rmse)
print("R2 score: %.4f" % train_r2)


Gráfico da função de perda

In [None]:
# Training-loss curve, one point per epoch, taken from the fit history.
if showGraphs:
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_title("Loss evolution")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.plot(history.history['loss'])

Gráfico da predição comparado com os dados de treino

In [None]:
# Training-set fit: model predictions against the true targets, both rescaled
# back to price units by multiplying with the scaling maximum.
if showGraphs:
    preds1 = pred_train * max
    plt.figure(figsize=(12, 12))
    # The legend labels were previously swapped (the true series was labelled
    # "predicted values"). Colours now match the test-data plot:
    # red = predicted, blue = real.
    plt.plot(preds1, c='red', label='predicted values', linewidth=2)
    plt.plot(Y_train * max, c='blue', label='real values', linewidth=2)
    # The x axis is the position of the sample within Y_train, not a day count.
    plt.xlabel("Sample index")
    plt.title("prediction on train data")
    plt.legend()


Gráfico da predição comparado com os dados de teste

In [None]:
# Held-out fit: predictions on the test inputs against the true test targets,
# both rescaled back to price units by multiplying with the scaling maximum.
if showGraphs:
    preds = model.predict(X_test) * max
    plt.figure(figsize=(14, 14))
    plt.plot(preds, c='red', label='predicted values', linewidth=2)
    plt.plot(Y_test * max, c='blue', label='real values', linewidth=2)
    # The x axis is the position of the sample within Y_test, not a day count.
    plt.xlabel("Sample index")
    plt.title("prediction on test data")
    plt.legend()


Train using a stacked LSTM (disabled experiment)

In [None]:
# Disabled experiment: a four-layer stacked LSTM regressor, kept for reference.
# NOTE(review): before re-enabling, note that the fit call uses `y_train`
# while the rest of the notebook defines `Y_train`, and that
# input_shape=(X_train.shape[1], 1) evaluates to (1, 1) for the
# (samples, 1, 10) arrays built earlier in the notebook.

# # The LSTM architecture
# regressor = Sequential()
# # First LSTM layer with Dropout regularisation
# regressor.add(LSTM(units=50, return_sequences=True,
#               input_shape=(X_train.shape[1], 1)))
# regressor.add(Dropout(0.2))

# # Second LSTM layer
# regressor.add(LSTM(units=50, return_sequences=True))
# regressor.add(Dropout(0.2))

# # Third LSTM layer
# regressor.add(LSTM(units=50, return_sequences=True))
# regressor.add(Dropout(0.5))

# # Fourth LSTM layer
# regressor.add(LSTM(units=50))
# regressor.add(Dropout(0.5))

# # The output layer
# regressor.add(Dense(units=1))

# # Compiling the RNN
# regressor.compile(optimizer='adam', loss='mean_absolute_error')
# # Fitting to the training set
# regressor.fit(X_train, y_train, epochs=1, batch_size=500)


This is clearly overfitting