In [22]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

from src.processing.splitting import time_based_split
from src.processing.scaling import scale_features
from src.processing.sequence_creator import create_sequences
from src.training.lstm_training import train_lstm
from src.training.hyperparameters_tuning import tune_hyperparameters

from config import config


# Both price tables sit in the same processed-data folder and are parsed the
# same way: the "date" column is read as datetimes and becomes the index.
prices_dir = config.DATA_DIR / "processed" / "crypto_prices"
csv_options = {"parse_dates": ["date"], "index_col": "date"}

df_btc = pd.read_csv(prices_dir / "btc.csv", **csv_options)
df_eth = pd.read_csv(prices_dir / "eth.csv", **csv_options)

## Split


In [23]:
def _split_by_time(frame):
    """Split one price frame into train/validation/test features and targets.

    Forwards ``frame`` to ``time_based_split`` with the settings shared by
    both assets: ``logPriceChange`` as the only target, ``test_months=12``,
    ``val_months=3`` and ``lags=30``. A fresh ``targets`` list is built on
    every call.
    """
    return time_based_split(
        frame,
        targets=["logPriceChange"],
        test_months=12,
        val_months=3,
        lags=30,
    )


# BTC: six pieces, in the order returned by time_based_split.
(
    X_train_btc,
    y_train_btc,
    X_val_btc,
    y_val_btc,
    X_test_btc,
    y_test_btc,
) = _split_by_time(df_btc)

# ETH: same split settings applied to the ETH frame.
(
    X_train_eth,
    y_train_eth,
    X_val_eth,
    y_val_eth,
    X_test_eth,
    y_test_eth,
) = _split_by_time(df_eth)

In [24]:
# Sanity-check the date range covered by the BTC test targets:
# the last rows are printed first, then the first rows.
print(y_test_btc.tail(), y_test_btc.head(), sep="\n")

            logPriceChange
date                      
2022-12-14       -0.016406
2022-12-15       -0.042190
2022-12-16        0.008828
2022-12-19        0.027989
2022-12-20       -0.005265
            logPriceChange
date                      
2021-12-02       -0.052332
2021-12-03       -0.085608
2021-12-06        0.002319
2021-12-07       -0.003859
2021-12-08       -0.057722


## Scale


In [25]:
# Scale the three BTC feature sets together; the call also hands back the
# scaler object it used (presumably fitted on the training split only —
# confirm in src.processing.scaling).
(
    X_train_scaled_btc,
    X_val_scaled_btc,
    X_test_scaled_btc,
    scaler_btc,
) = scale_features(X_train_btc, X_val_btc, X_test_btc)

# ETH gets its own, independent scaler.
(
    X_train_scaled_eth,
    X_val_scaled_eth,
    X_test_scaled_eth,
    scaler_eth,
) = scale_features(X_train_eth, X_val_eth, X_test_eth)

## Sequence Creation


In [26]:
# Length of every input window handed to create_sequences; it shows up as
# the second axis of the printed X shape.
sequence_window = 30

# --- BTC ---
X_train_seq_btc, y_train_seq_btc = create_sequences(
    X_train_scaled_btc, y_train_btc, seq_length=sequence_window
)
X_val_seq_btc, y_val_seq_btc = create_sequences(
    X_val_scaled_btc, y_val_btc, seq_length=sequence_window
)
X_test_seq_btc, y_test_seq_btc = create_sequences(
    X_test_scaled_btc, y_test_btc, seq_length=sequence_window
)

# Quick shape check on the BTC training tensors.
print(f"X_train_seq_btc shape: {X_train_seq_btc.shape}")
print(f"y_train_seq_btc shape: {y_train_seq_btc.shape}")

# --- ETH ---
X_train_seq_eth, y_train_seq_eth = create_sequences(
    X_train_scaled_eth, y_train_eth, seq_length=sequence_window
)
X_val_seq_eth, y_val_seq_eth = create_sequences(
    X_val_scaled_eth, y_val_eth, seq_length=sequence_window
)
X_test_seq_eth, y_test_seq_eth = create_sequences(
    X_test_scaled_eth, y_test_eth, seq_length=sequence_window
)

X_train_seq_btc shape: (1144, 30, 79)
y_train_seq_btc shape: (1144, 1)


# BTC


## Model Tuning

In [27]:
# Hyperparameter search for the BTC model: 50 trials, each one calling
# train_lstm on the training sequences and the validation sequences.
btc_tuning_data = (
    X_train_seq_btc,
    y_train_seq_btc,
    X_val_seq_btc,
    y_val_seq_btc,
)
study_lstm_btc = tune_hyperparameters(*btc_tuning_data, train_lstm, n_trials=50)

In [28]:
# Number of BTC test targets left after sequence creation (shown as the
# cell's output).
len(y_test_seq_btc)

235

## Model Training

In [29]:
# Take the winning configuration from the BTC hyperparameter study.
best_params_btc = study_lstm_btc.best_params
# A hard-coded parameter set is kept below, commented out; enabling it
# overrides the study result above.
# best_params_btc = {
#     "num_lstm_layers": 3,
#     "lstm_units_1": 192,
#     "lstm_units_2": 128,
#     "lstm_units_3": 32,
#     "num_dense_layers": 2,
#     "dense_units_1": 224,
#     "dense_units_2": 192,
#     "dropout_rate": 0.4,
#     "learning_rate": 0.001277122777959738,
#     "batch_size": 64,
# }

# The study stores one flat "<kind>_units_<n>" entry per layer (numbered
# from 1); train_lstm is given them as one list per layer kind instead.
lstm_layer_sizes = [
    best_params_btc[f"lstm_units_{i+1}"]
    for i in range(best_params_btc["num_lstm_layers"])
]
dense_layer_sizes = [
    best_params_btc[f"dense_units_{i+1}"]
    for i in range(best_params_btc["num_dense_layers"])
]

formatted_params = {
    "lstm_units": lstm_layer_sizes,
    "dense_units": dense_layer_sizes,
    # These settings are passed through unchanged.
    **{
        name: best_params_btc[name]
        for name in ("dropout_rate", "learning_rate", "batch_size")
    },
    # Epoch count for the final fit is fixed here, not taken from the study.
    "epochs": 300,
}

# Final BTC fit; train_lstm returns the model together with its history.
model_btc, history_btc = train_lstm(
    X_train_seq_btc,
    y_train_seq_btc,
    X_val_seq_btc,
    y_val_seq_btc,
    params=formatted_params,
)

Epoch 1/300
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 96ms/step - loss: 0.0149 - mae: 0.0941 - val_loss: 9.4738e-04 - val_mae: 0.0249
Epoch 2/300
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 98ms/step - loss: 0.0056 - mae: 0.0568 - val_loss: 0.0039 - val_mae: 0.0558
Epoch 3/300
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 91ms/step - loss: 0.0044 - mae: 0.0489 - val_loss: 9.5736e-04 - val_mae: 0.0252
Epoch 4/300
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 91ms/step - loss: 0.0027 - mae: 0.0370 - val_loss: 9.1096e-04 - val_mae: 0.0242
Epoch 5/300
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - loss: 0.0022 - mae: 0.0337 - val_loss: 9.4302e-04 - val_mae: 0.0244
Epoch 6/300
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 85ms/step - loss: 0.0022 - mae: 0.0327 - val_loss: 9.1330e-04 - val_mae: 0.0241
Epoch 7/300
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [30]:
# Resolve both output locations up front and make sure their folders exist:
# neither write below creates missing parent directories, so the cell would
# otherwise fail if the folders are not there yet.
weights_path_btc = (
    config.DATA_DIR
    / "weights"
    / "multi_source_data"
    / "log_price_change_weights_btc.weights.h5"
)
predictions_path_btc = (
    config.DATA_DIR
    / "predictions"
    / "multi_source_data"
    / "log_price_change_predictions_btc.csv"
)
for output_path in (weights_path_btc, predictions_path_btc):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# save weights
model_btc.save_weights(weights_path_btc)

# generate predictions, flattened to a 1-D array
y_pred_btc = model_btc.predict(X_test_seq_btc).flatten()

# create new df and save it to csv
# NOTE: the offset of 30 must stay equal to the seq_length used to build
# X_test_seq_btc (assumes the first 30 test rows only feed the first window
# and receive no prediction — confirm against create_sequences).
predictions_btc = pd.DataFrame(
    {
        "date": y_test_btc.index[30:],
        "predictedLogPriceChange": y_pred_btc,
    }
)

predictions_btc.to_csv(predictions_path_btc, index=False)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


# ETH

## Model Tuning

In [31]:
# Hyperparameter search for the ETH model: 50 trials, each one calling
# train_lstm on the training sequences and the validation sequences.
eth_tuning_data = (
    X_train_seq_eth,
    y_train_seq_eth,
    X_val_seq_eth,
    y_val_seq_eth,
)
study_lstm_eth = tune_hyperparameters(*eth_tuning_data, train_lstm, n_trials=50)

## Model Training

In [32]:
# Take the winning configuration from the ETH hyperparameter study.
best_params_eth = study_lstm_eth.best_params
# A hard-coded parameter set is kept below, commented out; enabling it
# overrides the study result above.
# best_params_eth = {
#     "num_lstm_layers": 1,
#     "lstm_units_1": 224,
#     "num_dense_layers": 2,
#     "dense_units_1": 96,
#     "dense_units_2": 256,
#     "dropout_rate": 0.2,
#     "learning_rate": 0.0014383064200000338,
#     "batch_size": 32,
# }

# The study stores one flat "<kind>_units_<n>" entry per layer (numbered
# from 1); train_lstm is given them as one list per layer kind instead.
lstm_layer_sizes = [
    best_params_eth[f"lstm_units_{i+1}"]
    for i in range(best_params_eth["num_lstm_layers"])
]
dense_layer_sizes = [
    best_params_eth[f"dense_units_{i+1}"]
    for i in range(best_params_eth["num_dense_layers"])
]

formatted_params = {
    "lstm_units": lstm_layer_sizes,
    "dense_units": dense_layer_sizes,
    # These settings are passed through unchanged.
    **{
        name: best_params_eth[name]
        for name in ("dropout_rate", "learning_rate", "batch_size")
    },
    # Epoch count for the final fit is fixed here, not taken from the study.
    "epochs": 300,
}

# Final ETH fit; train_lstm returns the model together with its history.
model_eth, history_eth = train_lstm(
    X_train_seq_eth,
    y_train_seq_eth,
    X_val_seq_eth,
    y_val_seq_eth,
    params=formatted_params,
)

Epoch 1/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 0.0384 - mae: 0.1422 - val_loss: 0.0108 - val_mae: 0.0853
Epoch 2/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 0.0187 - mae: 0.1053 - val_loss: 0.0099 - val_mae: 0.0914
Epoch 3/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0122 - mae: 0.0821 - val_loss: 0.0261 - val_mae: 0.1562
Epoch 4/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 0.0070 - mae: 0.0592 - val_loss: 0.0039 - val_mae: 0.0536
Epoch 5/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 0.0063 - mae: 0.0545 - val_loss: 0.0015 - val_mae: 0.0293
Epoch 6/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 0.0048 - mae: 0.0483 - val_loss: 0.0022 - val_mae: 0.0355
Epoch 7/300
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - 

In [33]:
# Resolve both output locations up front and make sure their folders exist:
# neither write below creates missing parent directories, so the cell would
# otherwise fail if the folders are not there yet.
weights_path_eth = (
    config.DATA_DIR
    / "weights"
    / "multi_source_data"
    / "log_price_change_weights_eth.weights.h5"
)
predictions_path_eth = (
    config.DATA_DIR
    / "predictions"
    / "multi_source_data"
    / "log_price_change_predictions_eth.csv"
)
for output_path in (weights_path_eth, predictions_path_eth):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# save weights
model_eth.save_weights(weights_path_eth)

# generate predictions, flattened to a 1-D array
y_pred_eth = model_eth.predict(X_test_seq_eth).flatten()

# create new df and save it to csv
# NOTE: the offset of 30 must stay equal to the seq_length used to build
# X_test_seq_eth (assumes the first 30 test rows only feed the first window
# and receive no prediction — confirm against create_sequences).
predictions_eth = pd.DataFrame(
    {
        "date": y_test_eth.index[30:],
        "predictedLogPriceChange": y_pred_eth,
    }
)

predictions_eth.to_csv(predictions_path_eth, index=False)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
