In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

In [2]:
# Number of consecutive past avg_price values that make up one input window;
# also the number of trailing training rows prepended to the test split.
WINDOW_SIZE = 10
# Fraction of the date-sorted rows assigned to the training split.
SPLIT_RATIO = 0.8
# Path of the raw price CSV (must provide "Date", "Open" and "Close" columns).
FILE_PATH = "../data/AA.csv"

In [3]:
# Read the raw CSV (parsing "Date" as datetimes), keep only the columns used
# downstream, and order the rows chronologically with a fresh 0..n-1 index.
cols = ["Date", "Open", "Close"]
df = (
    pd.read_csv(FILE_PATH, sep=",", parse_dates=["Date"])[cols]
    .copy()
    .sort_values("Date")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Date,Open,Close
0,1962-01-02,6.532155,6.532155
1,1962-01-03,6.532155,6.63228
2,1962-01-04,6.63228,6.63228
3,1962-01-05,6.63228,6.62427
4,1962-01-08,6.60825,6.408


In [4]:
# Per-row midpoint of the Open and Close prices; this is the series that gets
# windowed and predicted below.
df["avg_price"] = df["Open"].add(df["Close"]).div(2)
df.head()

Unnamed: 0,Date,Open,Close,avg_price
0,1962-01-02,6.532155,6.532155,6.532155
1,1962-01-03,6.532155,6.63228,6.582217
2,1962-01-04,6.63228,6.63228,6.63228
3,1962-01-05,6.63228,6.62427,6.628275
4,1962-01-08,6.60825,6.408,6.508125


In [5]:
def make_windows(data, window_size=None):
    """Build sliding-window samples from the ``avg_price`` column.

    Sample ``j`` uses ``avg_price[j : j + window_size]`` as its features and
    ``avg_price[j + window_size]`` as its target, so a frame with ``n`` rows
    yields ``n - window_size`` samples.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``avg_price`` column, in the row order to be windowed.
    window_size : int, optional
        Number of past values per sample. When omitted, the notebook-level
        ``WINDOW_SIZE`` constant is used, as before.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size)
        The feature windows.
    y : numpy.ndarray, shape (n_samples,)
        The value that immediately follows each window.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    prices = data["avg_price"].to_numpy()
    n_samples = len(prices) - window_size

    # Not enough rows for a single (window, target) pair: still return a 2-D X
    # so callers can slice columns (e.g. X[:, -1]) without special-casing.
    if n_samples <= 0:
        return (
            np.empty((0, window_size), dtype=prices.dtype),
            np.empty((0,), dtype=prices.dtype),
        )

    # Row j of the index matrix is [j, j+1, ..., j+window_size-1]; integer
    # indexing gathers every window at once and returns an independent copy.
    starts = np.arange(n_samples)[:, None]
    offsets = np.arange(window_size)[None, :]
    X = prices[starts + offsets]
    y = prices[window_size:].copy()
    return X, y

In [6]:
# Chronological split: the first SPLIT_RATIO share of rows is the training set.
train_size = int(len(df) * SPLIT_RATIO)

# Fail fast on degenerate splits. With train_size < WINDOW_SIZE the slice start
# below would be negative, and iloc would silently count from the END of the
# frame instead of reaching back into the training rows.
if train_size <= WINDOW_SIZE:
    raise ValueError(
        f"Training split has {train_size} rows; need more than "
        f"WINDOW_SIZE={WINDOW_SIZE} to build at least one training window."
    )
if train_size >= len(df):
    raise ValueError(
        f"SPLIT_RATIO={SPLIT_RATIO} leaves no rows for the test split "
        f"({len(df)} rows in total)."
    )

train_df = df.iloc[:train_size].reset_index(drop=True)
# The test frame starts WINDOW_SIZE rows before the split point so that the
# first test target (row train_size) has a full window of history behind it.
test_df = df.iloc[train_size - WINDOW_SIZE:].reset_index(drop=True)

In [7]:
# Turn each split into (window, next-value) sample arrays.
(X_train, y_train), (X_test, y_test) = map(make_windows, (train_df, test_df))

In [8]:
# Last value of every window, i.e. the price immediately before the target.
prevs, prevs_test = X_train[:, -1], X_test[:, -1]

# Binary direction label: 1 when the target is strictly above the preceding
# value, 0 otherwise (an unchanged price counts as 0).
dir_train = np.greater(y_train, prevs).astype(int)
dir_test = np.greater(y_test, prevs_test).astype(int)

In [9]:
# Windows are flattened to a single column before scaling, so one shared
# min/max (learned from the training windows only) applies to every position
# in a window; the scaled values are then folded back to (n_samples, WINDOW_SIZE).
scaler_X = MinMaxScaler(feature_range=(0, 1))

X_train_flat = X_train.reshape(-1, 1)
X_train_scaled = (
    scaler_X.fit(X_train_flat)
    .transform(X_train_flat)
    .reshape(X_train.shape[0], WINDOW_SIZE)
)

# The test windows reuse the training-fitted scaler; it is never refit on them.
X_test_flat = X_test.reshape(-1, 1)
X_test_scaled = scaler_X.transform(X_test_flat).reshape(X_test.shape[0], WINDOW_SIZE)

In [10]:
# Separate scaler for the targets, fit on the training targets only.
scaler_y = MinMaxScaler(feature_range=(0, 1))

# MinMaxScaler needs a 2-D (n_samples, 1) input. Use a dedicated name instead
# of rebinding y_train: if y_train itself became (n, 1), re-running the
# direction-label cell would broadcast (n, 1) > (n,) into an (n, n) matrix.
y_train_col = y_train.reshape(-1, 1)
scaler_y.fit(y_train_col)

# Back to a flat (n_samples,) vector for storage.
y_train_scaled = scaler_y.transform(y_train_col).flatten()

In [11]:
# Everything written to the processed-data archive, keyed by array name.
# NOTE(review): y_train is stored scaled while y_test is stored in original
# price units — presumably predictions are inverse-transformed with scaler_y
# before being compared against y_test; confirm with the consuming notebook.
processed_arrays = {
    "X_train": X_train_scaled,
    "y_train": y_train_scaled,
    "X_test": X_test_scaled,
    "y_test": y_test,
    "dir_train": dir_train,
    "dir_test": dir_test,
    "window_size": WINDOW_SIZE,
}
np.savez("../data/processed_data.npz", **processed_arrays)

In [12]:
# Persist both fitted scalers so later stages can apply the same scaling and
# map scaled predictions back to price units.
joblib.dump(value=scaler_X, filename="../models/scaler_X.pkl")
joblib.dump(value=scaler_y, filename="../models/scaler_y.pkl")

['../models/scaler_y.pkl']