In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

In [2]:
# Number of consecutive avg_price values that make up one input window.
WINDOW_SIZE: int = 10
# Fraction of the time-ordered rows assigned to the training split.
SPLIT_RATIO: float = 0.8

In [3]:
# Columns kept from the raw export; every other CSV column is dropped.
cols = ["timestamp", "open", "close", "volume", "marketCap"]

# Read the semicolon-separated history, keep only `cols`, then order the rows
# by timestamp and renumber the index from 0.
df = (
    pd.read_csv("../data/btc_historical.csv", sep=";", parse_dates=["timestamp"])
    .loc[:, cols]
    .sort_values("timestamp")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,timestamp,open,close,volume,marketCap
0,2024-09-19 23:59:59.999000+00:00,61651.155338,62940.456766,42710250000.0,1243490000000.0
1,2024-09-20 23:59:59.999000+00:00,62941.427595,63192.975083,35177160000.0,1248318000000.0
2,2024-09-21 23:59:59.999000+00:00,63184.341268,63394.838743,14408620000.0,1252541000000.0
3,2024-09-22 23:59:59.999000+00:00,63396.803553,63648.70989,20183350000.0,1257435000000.0
4,2024-09-23 23:59:59.999000+00:00,63643.10208,63329.802387,31400290000.0,1251215000000.0


In [4]:
# Midpoint of each row's open and close price; this is the only series the
# windowing step below reads.
df["avg_price"] = df["open"].add(df["close"]).div(2)
df.head()

Unnamed: 0,timestamp,open,close,volume,marketCap,avg_price
0,2024-09-19 23:59:59.999000+00:00,61651.155338,62940.456766,42710250000.0,1243490000000.0,62295.806052
1,2024-09-20 23:59:59.999000+00:00,62941.427595,63192.975083,35177160000.0,1248318000000.0,63067.201339
2,2024-09-21 23:59:59.999000+00:00,63184.341268,63394.838743,14408620000.0,1252541000000.0,63289.590005
3,2024-09-22 23:59:59.999000+00:00,63396.803553,63648.70989,20183350000.0,1257435000000.0,63522.756721
4,2024-09-23 23:59:59.999000+00:00,63643.10208,63329.802387,31400290000.0,1251215000000.0,63486.452233


In [5]:
def make_windows(data, window_size=None):
    """Build sliding-window samples from the ``avg_price`` column.

    Sample ``k`` pairs the ``window_size`` consecutive values starting at
    position ``k`` (the inputs) with the value that immediately follows them
    (the target).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``avg_price`` column; rows are used in their existing
        order.
    window_size : int, optional
        Number of past values per sample. When omitted, the notebook-level
        ``WINDOW_SIZE`` constant is used.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size)
        Input windows.
    y : numpy.ndarray, shape (n_samples,)
        Value following each window.

        ``n_samples`` is ``max(len(data) - window_size, 0)``. ``X`` stays
        two-dimensional even when there are no samples, so column indexing
        such as ``X[:, -1]`` keeps working.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    prices = data["avg_price"].to_numpy()
    n_samples = max(len(prices) - window_size, 0)

    # Row k of `positions` is [k, k + 1, ..., k + window_size - 1]; indexing
    # with it gathers every window in one vectorised step.
    positions = np.arange(n_samples)[:, None] + np.arange(window_size)[None, :]
    X = prices[positions]
    # Copy so the targets do not alias the DataFrame's underlying buffer.
    y = prices[window_size:].copy()
    return X, y

In [6]:
# Chronological split: the first SPLIT_RATIO share of rows is used for training.
train_size = int(SPLIT_RATIO * len(df))
train_df = df.head(train_size).reset_index(drop=True)
# The test frame starts WINDOW_SIZE rows before the split point, so the first
# window built from it targets the first row that is not in train_df.
test_df = df.iloc[train_size - WINDOW_SIZE:].reset_index(drop=True)

In [7]:
# Turn each split into (window, next value) pairs.
X_train, y_train = make_windows(data=train_df)
X_test, y_test = make_windows(data=test_df)

In [8]:
# Last value of every window, i.e. the price immediately preceding the target.
prevs = X_train[:, -1]
prevs_test = X_test[:, -1]

# Direction label: 1 when the target is strictly above the preceding price,
# otherwise 0.
dir_train = np.greater(y_train, prevs).astype(int)
dir_test = np.greater(y_test, prevs_test).astype(int)

In [9]:
# A single min/max range is shared by every window position: the windows are
# flattened into one column before the scaler is fitted.
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_train_flat = X_train.reshape(-1, 1)
X_train_scaled = scaler_X.fit_transform(X_train_flat).reshape((len(X_train), WINDOW_SIZE))

# Test windows are transformed with the range fitted on the training windows;
# the scaler is not refitted here.
X_test_flat = X_test.reshape(-1, 1)
X_test_scaled = scaler_X.transform(X_test_flat).reshape((len(X_test), WINDOW_SIZE))

In [10]:
# Fit a separate scaler on the training targets only.
# The column-vector reshape that MinMaxScaler needs is done inline, so y_train
# itself stays 1-D. Rebinding y_train to shape (n, 1) would make a later re-run
# of the direction-label cell broadcast `y_train > prevs` into an (n, n) matrix.
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()

In [11]:
# Bundle everything produced above into one .npz archive.
# Note: X_train, X_test and y_train are stored scaled, while y_test is stored
# as built by make_windows (it is never passed through scaler_y above).
np.savez(
    "../data/processed_data.npz",
    **{
        "X_train": X_train_scaled,
        "y_train": y_train_scaled,
        "X_test": X_test_scaled,
        "y_test": y_test,
        "dir_train": dir_train,
        "dir_test": dir_test,
        "window_size": WINDOW_SIZE,
    },
)

In [12]:
# Persist both fitted scalers next to the models so the same input scaling and
# target scaling can be reloaded later.
joblib.dump(scaler_X, "../models/scaler_X.pkl")
# Kept as the final bare expression: its return value (the list of written
# paths) is what the cell displays.
joblib.dump(scaler_y, "../models/scaler_y.pkl")

['../models/scaler_y.pkl']