In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

In [2]:
# Number of consecutive rows that make up one input window (the look-back
# length used when the sliding windows are built).
WINDOW_SIZE = 30
# Fraction of the rows (after chronological sorting) assigned to the training split.
SPLIT_RATIO = 0.8
# Relative path of the CSV holding the historical price data that gets loaded below.
FILE_PATH = "../data/btc_historical.csv"

In [3]:
# Read the ';'-separated price history, keep only the columns needed further
# down, and put the rows in chronological order with a fresh 0..n-1 index.
cols = ["Date", "Open", "Close"]
raw_prices = pd.read_csv(FILE_PATH, sep=";", parse_dates=["Date"])
df = raw_prices[cols].copy().sort_values("Date").reset_index(drop=True)
df.head()

Unnamed: 0,Date,Open,Close
0,2024-09-19 23:59:59.999000+00:00,61651.155338,62940.456766
1,2024-09-20 23:59:59.999000+00:00,62941.427595,63192.975083
2,2024-09-21 23:59:59.999000+00:00,63184.341268,63394.838743
3,2024-09-22 23:59:59.999000+00:00,63396.803553,63648.70989
4,2024-09-23 23:59:59.999000+00:00,63643.10208,63329.802387


In [4]:
# Derive the three model features from each row's Open/Close pair.
open_px = df["Open"]
close_px = df["Close"]
open_to_close = close_px - open_px

df["avg_price"] = (open_px + close_px) / 2        # midpoint of open and close
df["price_change"] = open_to_close                # absolute move over the row
df["price_change_pct"] = open_to_close / open_px  # move relative to the open
df.head()

Unnamed: 0,Date,Open,Close,avg_price,price_change,price_change_pct
0,2024-09-19 23:59:59.999000+00:00,61651.155338,62940.456766,62295.806052,1289.301428,0.020913
1,2024-09-20 23:59:59.999000+00:00,62941.427595,63192.975083,63067.201339,251.547488,0.003997
2,2024-09-21 23:59:59.999000+00:00,63184.341268,63394.838743,63289.590005,210.497475,0.003331
3,2024-09-22 23:59:59.999000+00:00,63396.803553,63648.70989,63522.756721,251.906338,0.003973
4,2024-09-23 23:59:59.999000+00:00,63643.10208,63329.802387,63486.452233,-313.299694,-0.004923


In [5]:
def make_windows(data, window_size=None):
    """Build sliding-window samples for next-row ``avg_price`` prediction.

    Each sample consists of ``window_size`` consecutive rows of the columns
    ``avg_price``, ``price_change`` and ``price_change_pct``; its target is the
    ``avg_price`` of the row immediately after the window.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three feature columns, already in the order the
        windows should follow.
    window_size : int, optional
        Number of rows per window. When omitted, the notebook-level
        ``WINDOW_SIZE`` constant is used, which matches the previous behaviour.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(len(data) - window_size, window_size, 3)``.
    y : numpy.ndarray
        Shape ``(len(data) - window_size,)``.

    When ``data`` has no more rows than ``window_size`` both arrays are empty
    but keep their full rank, so downstream ``X.shape[2]`` / ``reshape`` calls
    still work instead of failing on a shape-``(0,)`` array.
    """
    if window_size is None:
        # Resolved at call time so edits to the config cell are picked up.
        window_size = WINDOW_SIZE

    features = ["avg_price", "price_change", "price_change_pct"]
    # Convert once up front; slicing NumPy arrays in the loop is far cheaper
    # than repeated DataFrame.iloc lookups.
    feature_values = data[features].to_numpy()
    targets = data["avg_price"].to_numpy()

    n_rows = len(data)
    if n_rows <= window_size:
        empty_X = np.empty((0, window_size, len(features)), dtype=feature_values.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack(
        [feature_values[end - window_size:end] for end in range(window_size, n_rows)]
    )
    # Copy so the returned targets never alias the frame's underlying buffer.
    y = targets[window_size:].copy()
    return X, y

In [6]:
# Chronological split: the first SPLIT_RATIO share of rows is used for training.
train_size = int(len(df) * SPLIT_RATIO)
train_df = df.iloc[:train_size].reset_index(drop=True)

# The test frame starts WINDOW_SIZE rows before the split point so that the
# first test window has a full history and its target is the first held-out row.
# Clamp at 0: a negative start would make iloc count from the END of the frame
# and silently select the wrong rows when the data is shorter than expected.
test_start = max(train_size - WINDOW_SIZE, 0)
test_df = df.iloc[test_start:].reset_index(drop=True)

In [7]:
# Turn each split into (windows, next-row targets) pairs, train first, then test.
(X_train, y_train), (X_test, y_test) = (
    make_windows(split_df) for split_df in (train_df, test_df)
)

In [8]:
# Show the shape of every array produced by the windowing step.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

((288, 30, 3), (288,), (80, 30, 3), (80,))

In [9]:
# Binary direction labels: 1 when the target is above the avg_price of the
# last row in its window (feature column 0 is avg_price), otherwise 0.
prevs, prevs_test = X_train[:, -1, 0], X_test[:, -1, 0]
dir_train = np.greater(y_train, prevs).astype(int)
dir_test = np.greater(y_test, prevs_test).astype(int)

In [10]:
# Scale every feature into the (0, 1) range using statistics learned from the
# training windows only; the test windows are transformed with the same scaler.
n_features = X_train.shape[2]

# The scaler is fed 2-D input, so the (sample, step) axes are collapsed into one.
X_train_flat = X_train.reshape(-1, n_features)
X_test_flat = X_test.reshape(-1, n_features)

scaler_X = MinMaxScaler(feature_range=(0, 1))
scaler_X.fit(X_train_flat)

# Restore the original 3-D layout after scaling.
train_shape = (X_train.shape[0], WINDOW_SIZE, n_features)
test_shape = (X_test.shape[0], WINDOW_SIZE, n_features)
X_train_scaled = scaler_X.transform(X_train_flat).reshape(train_shape)
X_test_scaled = scaler_X.transform(X_test_flat).reshape(test_shape)

In [11]:
# Fit the target scaler on the training targets only and scale them to (0, 1).
scaler_y = MinMaxScaler(feature_range=(0, 1))

# The scaler needs a column vector. Keep the 2-D view under its own name rather
# than rebinding y_train: with y_train turned into shape (n, 1), re-running the
# direction-label cell would broadcast `y_train > prevs` into an (n, n) matrix.
y_train_2d = y_train.reshape(-1, 1)
scaler_y.fit(y_train_2d)
y_train_scaled = scaler_y.transform(y_train_2d).flatten()

In [None]:
# Collect everything the later stages need and write it to a single .npz archive.
# NOTE(review): y_train is stored scaled while y_test is stored as produced by
# the windowing step (no scaler is applied to it in this notebook) - confirm
# that consumers of the archive expect this asymmetry.
processed_arrays = {
    "X_train": X_train_scaled,
    "y_train": y_train_scaled,
    "X_test": X_test_scaled,
    "y_test": y_test,
    "dir_train": dir_train,
    "dir_test": dir_test,
    "window_size": WINDOW_SIZE,
    "n_features": n_features,
}
np.savez("../data/processed_data.npz", **processed_arrays)

In [None]:
# Persist both fitted scalers so the same transforms (and their inverses) can
# be reapplied outside this notebook.
joblib.dump(value=scaler_X, filename="../models/scaler_X.pkl")
joblib.dump(value=scaler_y, filename="../models/scaler_y.pkl")

['../models/scaler_y_1.pkl']