In [1]:
import os
import time
import numpy as np
import pandas as pd
import ta  # Install via: pip install ta
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, LSTM, Dense, Concatenate, TimeDistributed, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras.callbacks import EarlyStopping
import optuna

In [2]:
# =======================================================
# PART 1: Data Processing & Feature Engineering
# =======================================================

# Date range used to filter every raw price file (edit as needed).
# Both bounds are written day-first: DD/MM/YYYY.
start_date_str = "02/01/2018"
end_date_str = "28/02/2025"
start_date_filter, end_date_filter = (
    pd.to_datetime(bound, dayfirst=True) for bound in (start_date_str, end_date_str)
)

# Folder with raw CSV files (e.g., AAPL.csv, SPY.csv)
raw_data_dir = "./data"
# Folder that receives the filtered, feature-enriched CSV files
filtered_data_dir = "./filtered_data_optimizations"


In [3]:
# Create the output folder for processed CSVs; exist_ok=True makes this a no-op
# when the folder is already present, so the cell can be re-run safely.
os.makedirs(filtered_data_dir, exist_ok=True)

def calculate_tema(series, window):
    """
    Triple Exponential Moving Average (TEMA) of ``series``.

    Three EMAs with the same ``window`` are chained (EMA of EMA of EMA) and
    combined as:
        TEMA = 3*EMA1 - 3*EMA2 + EMA3

    The EMAs are computed with ``fillna=False``, so warm-up positions are
    left as NaN in the result.
    """
    def _ema(values):
        # Single EMA pass with the shared window
        return ta.trend.EMAIndicator(close=values, window=window, fillna=False).ema_indicator()

    first_pass = _ema(series)
    second_pass = _ema(first_pass)
    third_pass = _ema(second_pass)
    return 3 * first_pass - 3 * second_pass + third_pass

def process_csv(file_path, filename):
    """
    Load one raw OHLCV CSV, restrict it to the configured date range and
    append technical-indicator features.

    Parameters
    ----------
    file_path : str
        Path of the CSV to read. It must contain a "Date" column plus
        "Open", "High", "Low", "Close" and "Volume".
    filename : str
        Base name of the file; used in log messages and to detect SPY.csv,
        which is parsed with day-first dates.

    Returns
    -------
    pandas.DataFrame or None
        Feature frame whose "Date" column holds YYYY-MM-DD strings. None is
        returned when required columns are missing, when no row falls inside
        [start_date_filter, end_date_filter], or when any exception occurs
        (the error is printed rather than raised so one bad file does not
        stop the batch).
    """
    try:
        # SPY.csv is parsed day-first; every other file month-first.
        is_spy = filename.upper() == "SPY.CSV"
        df = pd.read_csv(file_path, parse_dates=["Date"], dayfirst=is_spy)

        # Convert to tz-naive UTC timestamps. Unparseable values become NaT and
        # are dropped here (dropping only before the coercion would miss them).
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce", utc=True).dt.tz_localize(None)
        # Rolling/exponential indicators assume chronological order.
        df = df.dropna(subset=["Date"]).sort_values("Date", kind="stable")

        required_columns = ["Close", "High", "Low", "Open", "Volume"]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Warning: {filename} missing {missing_columns}")
            return None
        print(f"{filename} - Date Range: {df['Date'].min()} to {df['Date'].max()}")

        # Compare calendar days, not timestamps: after the UTC conversion a row
        # can carry a time-of-day (e.g. 05:00:00), which would make a bar dated
        # exactly end_date_filter compare as later than the midnight bound and
        # silently drop the last requested day.
        trading_day = df["Date"].dt.normalize()
        in_range = (trading_day >= start_date_filter) & (trading_day <= end_date_filter)
        df_filtered = df.loc[in_range].copy()
        if df_filtered.empty:
            print(f"Warning: {filename} has no data in the date range.")
            return None
        df_filtered = df_filtered[["Date", "Open", "High", "Low", "Close", "Volume"]]

        # NOTE(review): indicators are computed on the already-filtered window,
        # so the warm-up rows of the longest indicators (TEMA_200 chains three
        # 200-period EMAs) are NaN and get dropped below. The usable data
        # therefore starts well after start_date_filter. Computing on the full
        # history and filtering afterwards would avoid that, but changes the
        # produced dataset, so it is left as-is here.
        windows = [14, 26, 50, 100, 200]

        # Standard Technical Indicators
        for window in windows:
            df_filtered[f"SMA_{window}"] = ta.trend.SMAIndicator(close=df_filtered["Close"], window=window, fillna=False).sma_indicator()
            df_filtered[f"EMA_{window}"] = ta.trend.EMAIndicator(close=df_filtered["Close"], window=window, fillna=False).ema_indicator()
            df_filtered[f"TEMA_{window}"] = calculate_tema(df_filtered["Close"], window)
        bb_indicator = ta.volatility.BollingerBands(close=df_filtered["Close"], window=20, window_dev=2, fillna=False)
        df_filtered["BB_Hband"] = bb_indicator.bollinger_hband()
        df_filtered["BB_Mband"] = bb_indicator.bollinger_mavg()
        df_filtered["BB_Lband"] = bb_indicator.bollinger_lband()
        df_filtered["RSI_14"] = ta.momentum.RSIIndicator(close=df_filtered["Close"], window=14, fillna=False).rsi()
        macd_indicator = ta.trend.MACD(close=df_filtered["Close"], window_slow=26, window_fast=12, window_sign=9, fillna=False)
        df_filtered["MACD"] = macd_indicator.macd()
        df_filtered["MACD_Signal"] = macd_indicator.macd_signal()
        df_filtered["MACD_Hist"] = macd_indicator.macd_diff()

        # Derived price / momentum ratios
        df_filtered["Mean_HL"] = (df_filtered["High"] + df_filtered["Low"]) / 2.0
        df_filtered["RMom_14"] = df_filtered["Close"] / df_filtered["Close"].shift(14)
        for window in windows:
            df_filtered[f"MomTEMA_{window}_ofs1"] = df_filtered[f"TEMA_{window}"] / df_filtered[f"TEMA_{window}"].shift(1)
            df_filtered[f"RCTEMA_{window}"] = df_filtered["Close"] / df_filtered[f"TEMA_{window}"]
            df_filtered[f"MomEMA_{window}_ofs1"] = df_filtered[f"EMA_{window}"] / df_filtered[f"EMA_{window}"].shift(1)
        df_filtered["RTEMA_TEMA_14_50"] = df_filtered["TEMA_14"] / df_filtered["TEMA_50"]
        df_filtered["REMA_EMA_14_50"] = df_filtered["EMA_14"] / df_filtered["EMA_50"]
        df_filtered["RSMA_SMA_14_50"] = df_filtered["SMA_14"] / df_filtered["SMA_50"]
        df_filtered["RVolSMA_20"] = df_filtered["Volume"] / df_filtered["Volume"].rolling(window=20).mean()

        # Drop warm-up NaNs and also +/-inf: a zero denominator in the ratio
        # features yields inf, which dropna() alone would keep.
        numeric_part = df_filtered.select_dtypes(include="number")
        finite_rows = np.isfinite(numeric_part).all(axis=1)
        df_filtered = df_filtered.loc[finite_rows].dropna().copy()

        df_filtered["Date"] = df_filtered["Date"].dt.strftime("%Y-%m-%d")
        return df_filtered
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return None

# Process all CSV files in raw_data_dir.
# - sorted(): os.listdir order is arbitrary, sorting makes the run order (and log) reproducible.
# - the extension is matched case-insensitively, consistent with process_csv,
#   which already recognises "SPY.CSV" regardless of case.
for filename in sorted(os.listdir(raw_data_dir)):
    if not filename.lower().endswith(".csv"):
        continue
    file_path = os.path.join(raw_data_dir, filename)
    processed_df = process_csv(file_path, filename)
    if processed_df is None:
        # process_csv has already printed why this file was skipped
        continue
    output_path = os.path.join(filtered_data_dir, filename)
    processed_df.to_csv(output_path, index=False)
    print(f"Processed {filename} -> {output_path}")

print("Processing complete.")

AAPL.csv - Date Range: 1980-12-12 05:00:00 to 2025-03-03 05:00:00
Processed AAPL.csv -> ./filtered_data_optimizations\AAPL.csv
ABBV.csv - Date Range: 2013-01-02 05:00:00 to 2025-03-03 05:00:00
Processed ABBV.csv -> ./filtered_data_optimizations\ABBV.csv
ADBE.csv - Date Range: 1986-08-13 04:00:00 to 2025-03-03 05:00:00
Processed ADBE.csv -> ./filtered_data_optimizations\ADBE.csv
AMD.csv - Date Range: 1980-03-17 05:00:00 to 2025-03-03 05:00:00
Processed AMD.csv -> ./filtered_data_optimizations\AMD.csv
AMT.csv - Date Range: 1998-02-27 05:00:00 to 2025-03-03 05:00:00
Processed AMT.csv -> ./filtered_data_optimizations\AMT.csv
AMZN.csv - Date Range: 1997-05-15 04:00:00 to 2025-03-03 05:00:00
Processed AMZN.csv -> ./filtered_data_optimizations\AMZN.csv
BA.csv - Date Range: 1962-01-02 05:00:00 to 2025-03-03 05:00:00
Processed BA.csv -> ./filtered_data_optimizations\BA.csv
BAC.csv - Date Range: 1973-02-21 05:00:00 to 2025-03-03 05:00:00
Processed BAC.csv -> ./filtered_data_optimizations\BAC.csv

In [4]:
# -------------------------------------------------------
# PART 2: Load Filtered Data & Prepare Sequences
# -------------------------------------------------------

def load_csv_data(filepath):
    """
    Read a processed CSV and return it ordered by ascending "Date".

    The "Date" column is parsed with the fixed format YYYY-MM-DD. The original
    row labels are kept (the index is not reset after sorting).
    """
    return (
        pd.read_csv(filepath)
        .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%Y-%m-%d"))
        .sort_values("Date")
    )

def get_model_features(df):
    """
    Return a new frame holding "Date" followed by the model's feature columns,
    in a fixed order.

    Any feature column absent from ``df`` is created and filled with 0.0.
    The input frame is left untouched (missing columns are added to a copy,
    not to the caller's DataFrame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Date" column.

    Returns
    -------
    pandas.DataFrame
        Independent copy with columns ["Date"] + desired features.

    Raises
    ------
    KeyError
        If ``df`` has no "Date" column.
    """
    desired_features = [
        "Open", "High", "Low", "Close", "Volume",
        "RSI_14",
        "SMA_14", "SMA_26", "SMA_50", "SMA_100", "SMA_200",
        "EMA_14", "EMA_26", "EMA_50", "EMA_100", "EMA_200",
        "BB_Hband", "BB_Mband", "BB_Lband",
        "Mean_HL", "MACD", "MACD_Signal", "MACD_Hist",
        "RMom_14", "MomTEMA_14_ofs1", "MomTEMA_26_ofs1", "MomTEMA_50_ofs1", "MomTEMA_100_ofs1", "MomTEMA_200_ofs1",
        "RCTEMA_14", "RCTEMA_26", "RCTEMA_50", "RCTEMA_100", "RCTEMA_200",
        "MomEMA_14_ofs1", "MomEMA_26_ofs1", "MomEMA_50_ofs1", "MomEMA_100_ofs1", "MomEMA_200_ofs1",
        "RTEMA_TEMA_14_50", "REMA_EMA_14_50", "RSMA_SMA_14_50",
        "RVolSMA_20"
    ]
    # Work on a copy of the columns that exist so the caller's frame is not mutated.
    present = [col for col in desired_features if col in df.columns]
    features = df[["Date"] + present].copy()
    for col in desired_features:
        if col not in features.columns:
            features[col] = 0.0
    # Re-select to enforce the canonical column order.
    return features[["Date"] + desired_features].copy()

def prepare_sequences(df, seq_length):
    """
    Turn a feature frame into sliding-window samples for next-step prediction.

    For every row index i >= seq_length, the sample is rows [i-seq_length, i)
    of all non-"Date" columns, the target is the "Close" value of row i, and
    the sample's date is the "Date" of row i.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Date" and "Close"; rows are used in their given order.
    seq_length : int
        Number of consecutive rows per input window (must be >= 1).

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, seq_length, n_features)
    y : numpy.ndarray, shape (n_samples,)
    seq_dates : numpy.ndarray of datetime64[ns], shape (n_samples,)
        When ``df`` has ``seq_length`` rows or fewer, n_samples is 0 and the
        arrays are still returned with these shapes, so they can be
        concatenated with results from other frames.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    dates = df["Date"].values
    df_features = df.drop(columns=["Date"])
    data_array = df_features.values
    target_index = df_features.columns.get_loc("Close")
    n_rows, n_features = data_array.shape

    target_rows = range(seq_length, n_rows)
    if len(target_rows) == 0:
        # Not enough rows for a single window: return correctly shaped empties
        # instead of 1-D arrays of shape (0,).
        return (
            np.empty((0, seq_length, n_features), dtype=data_array.dtype),
            np.empty((0,), dtype=data_array.dtype),
            np.array([], dtype='datetime64[ns]'),
        )

    X = np.stack([data_array[i - seq_length:i] for i in target_rows])
    y = data_array[seq_length:, target_index].copy()
    seq_dates = np.array(list(dates[seq_length:]), dtype='datetime64[ns]')
    return X, y, seq_dates

sequence_length = 20  # fixed sequence length

X_list, y_list, dates_list, ticker_list = [], [], [], []
all_tickers = []

# sorted(): os.listdir order is arbitrary; a fixed file order keeps the
# concatenated sample order (and everything derived from it) reproducible.
for filename in sorted(os.listdir(filtered_data_dir)):
    if not filename.lower().endswith(".csv"):
        continue
    ticker = os.path.splitext(filename)[0]
    all_tickers.append(ticker)
    filepath = os.path.join(filtered_data_dir, filename)
    df_raw = load_csv_data(filepath)
    df_feat = get_model_features(df_raw)
    # A file needs more rows than one window to yield at least one sample
    if len(df_feat) > sequence_length:
        X, y, seq_dates = prepare_sequences(df_feat, sequence_length)
        X_list.append(X)
        y_list.append(y)
        dates_list.append(seq_dates)
        ticker_list.extend([ticker] * len(y))

if not X_list:
    # Fail with an actionable message instead of numpy's
    # "need at least one array to concatenate".
    raise ValueError(
        f"No CSV file in {filtered_data_dir!r} has more than {sequence_length} rows; "
        "no sequences could be built."
    )

# Pool the samples of all tickers (grouped by file, not yet ordered by date)
X_all = np.concatenate(X_list, axis=0)
y_all = np.concatenate(y_list, axis=0)
dates_all = np.concatenate(dates_list, axis=0)
num_features = X_all.shape[2]

# Global scaling using RobustScaler
# NOTE(review): the scaler is fitted on every sample, including those later
# used as test folds, so test-period statistics leak into the scaling.
# Consider fitting on the training portion only.
scaler = RobustScaler()
all_data = X_all.reshape(-1, num_features)
scaler.fit(all_data)
def scale_sequences(X, scaler):
    """
    Apply a fitted feature scaler to every timestep of every sequence.

    Parameters
    ----------
    X : array-like, shape (n_samples, seq_length, n_features)
    scaler : object with a ``transform(2-D array) -> 2-D array`` method
        that scales each feature column independently.

    Returns
    -------
    numpy.ndarray with the same shape as ``X``.

    All timesteps are flattened to one (n_samples * seq_length, n_features)
    matrix and transformed in a single call instead of one call per sequence.
    An empty ``X`` is returned as an empty float array of the same shape
    without calling the scaler.
    """
    X = np.asarray(X)
    if X.shape[0] == 0:
        return X.astype(float)
    flat = X.reshape(-1, X.shape[-1])
    return np.asarray(scaler.transform(flat)).reshape(X.shape)
# Scaled copy of all pooled sequences, produced with the scaler fitted above.
X_all_scaled = scale_sequences(X_all, scaler)

In [5]:
# -------------------------------------------------------
# PART 3: Walk-Forward Split Function for Sequences
# -------------------------------------------------------
def walk_forward_split(X, y, dates, train_window, test_window):
    """
    Split sequences (X, y) into rolling train/test folds in date order.

    Samples are first sorted by ``dates`` (stable sort, so samples sharing a
    date keep their input order). Fold k trains on ``train_window`` consecutive
    samples and tests on the next ``test_window`` samples; each following fold
    is shifted forward by ``test_window``.

    Parameters
    ----------
    X, y, dates : numpy.ndarray
        Arrays with the same first dimension.
    train_window, test_window : int
        Number of samples per training / test slice (both must be >= 1).

    Returns
    -------
    list of (X_train, y_train, X_test, y_test) tuples
        Empty when fewer than ``train_window + test_window`` samples exist.

    Raises
    ------
    ValueError
        If either window is smaller than 1. A zero ``test_window`` would never
        advance the fold start and would loop forever.
    """
    if train_window < 1 or test_window < 1:
        raise ValueError(
            f"train_window and test_window must be >= 1, got {train_window} and {test_window}"
        )

    order = np.argsort(dates, kind="stable")
    X = X[order]
    y = y[order]

    splits = []
    last_start = len(X) - train_window - test_window
    for start_idx in range(0, last_start + 1, test_window):
        train_end = start_idx + train_window
        test_end = train_end + test_window
        splits.append((
            X[start_idx:train_end], y[start_idx:train_end],
            X[train_end:test_end], y[train_end:test_end],
        ))
    return splits

In [6]:
# -------------------------------------------------------
# PART 4: Define Distribution Strategy
# -------------------------------------------------------
# Distribution strategy whose scope is used when building models.
# num_replicas_in_sync reports how many devices it spans; the recorded run
# below shows a single CPU device.
strategy = tf.distribute.MirroredStrategy()
print("Number of devices:", strategy.num_replicas_in_sync)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of devices: 1


In [7]:
# -------------------------------------------------------
# PART 5: Build Model for Hyperparameter Optimization
# -------------------------------------------------------
def build_model_hp(params):
    """
    Build and compile the two-branch recurrent regressor for one
    hyperparameter set.

    Architecture (created inside ``strategy.scope()``):
      * a TimeDistributed dense layer applied to every timestep of the input
        of shape (sequence_length, num_features);
      * a short-term branch: two stacked GRUs over the last
        ``short_timesteps`` timesteps;
      * a long-term branch: three stacked LSTMs over the full sequence;
      * both branch outputs concatenated, one dense layer, one linear output.

    ``params`` is a mapping of hyperparameter names to values; any missing key
    falls back to the default written next to it below. The ``*_dropout``
    entries are passed as ``recurrent_dropout`` of the matching layer.

    Returns the compiled Keras model (Adam optimizer, Huber loss, MAE metric).
    """
    hp = params.get  # shorthand: hp(name, default)

    with strategy.scope():
        inputs = Input(shape=(sequence_length, num_features))

        # TimeDistributed feature extraction (parameterizable)
        features = TimeDistributed(Dense(hp('td_units', 64), activation='selu'))(inputs)

        # Short-term branch (processing last few timesteps)
        short_timesteps = hp('short_timesteps', 5)
        recent = Lambda(lambda x: x[:, -short_timesteps:, :])(features)
        short_branch = GRU(
            hp('gru1_units', 64),
            return_sequences=True,
            recurrent_dropout=hp('gru1_dropout', 0.2),
        )(recent)
        short_branch = GRU(
            hp('gru2_units', 32),
            recurrent_dropout=hp('gru2_dropout', 0.1),
        )(short_branch)

        # Long-term branch (processing full sequence)
        long_branch = LSTM(
            hp('lstm1_units', 128),
            return_sequences=True,
            recurrent_dropout=hp('lstm1_dropout', 0.2),
        )(features)
        long_branch = LSTM(
            hp('lstm2_units', 64),
            return_sequences=True,
            recurrent_dropout=hp('lstm2_dropout', 0.1),
        )(long_branch)
        long_branch = LSTM(
            hp('lstm3_units', 32),
            recurrent_dropout=hp('lstm3_dropout', 0.1),
        )(long_branch)

        # Merge branches and regress a single value
        merged = Concatenate()([short_branch, long_branch])
        head = Dense(hp('dense_units', 32), activation='selu')(merged)
        outputs = Dense(1)(head)

        model = Model(inputs=inputs, outputs=outputs)
        model.compile(
            optimizer=Adam(learning_rate=hp('learning_rate', 5e-4)),
            loss=Huber(delta=hp('huber_delta', 1.5)),
            metrics=['mae'],
        )
    return model

In [10]:
# optuna is already imported in the notebook's import cell; only the log level
# is configured here.
optuna.logging.set_verbosity(optuna.logging.INFO)

# -------------------------------------------------------
# PART 6: Hyperparameter Optimization Objective Function with Monitoring
# -------------------------------------------------------
def objective(trial):
    """
    Optuna objective: mean walk-forward test loss for one hyperparameter set.

    Reads the module-level arrays ``X_all_scaled``, ``y_all`` and
    ``dates_all``. For every fold a fresh model is built, trained with early
    stopping and evaluated on the fold's test slice. The running mean loss is
    reported to the trial after each fold so that a pruner attached to the
    study can stop unpromising trials early.

    Parameters
    ----------
    trial : optuna.trial.Trial

    Returns
    -------
    float
        Mean of the per-fold losses (lower is better).

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner asks to stop before the last fold.
    """
    # Suggest hyperparameters for various parts of the model
    params = {
        'td_units': trial.suggest_int('td_units', 32, 128),
        'short_timesteps': trial.suggest_int('short_timesteps', 3, 10),
        'gru1_units': trial.suggest_int('gru1_units', 32, 128),
        'gru2_units': trial.suggest_int('gru2_units', 16, 64),
        'gru1_dropout': trial.suggest_float('gru1_dropout', 0.1, 0.4),
        'gru2_dropout': trial.suggest_float('gru2_dropout', 0.05, 0.3),
        'lstm1_units': trial.suggest_int('lstm1_units', 64, 256),
        'lstm2_units': trial.suggest_int('lstm2_units', 32, 128),
        'lstm3_units': trial.suggest_int('lstm3_units', 16, 64),
        'lstm1_dropout': trial.suggest_float('lstm1_dropout', 0.1, 0.4),
        'lstm2_dropout': trial.suggest_float('lstm2_dropout', 0.05, 0.3),
        'lstm3_dropout': trial.suggest_float('lstm3_dropout', 0.05, 0.3),
        'dense_units': trial.suggest_int('dense_units', 16, 64),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'huber_delta': trial.suggest_float('huber_delta', 1.0, 2.0)
    }

    # Set training parameters for each fold during optimization
    epochs = 20
    batch_size = 16

    # Determine train and test window sizes based on total number of samples
    N = X_all_scaled.shape[0]
    train_window = int(0.6 * N)
    test_window  = int(0.1 * N)

    # With very few samples a window rounds down to 0; walk_forward_split
    # cannot work with an empty window, so go straight to the fallback.
    splits = []
    if train_window > 0 and test_window > 0:
        splits = walk_forward_split(X_all_scaled, y_all, dates_all, train_window, test_window)
    # If no valid split is obtained, fallback to one simple split.
    if not splits:
        # The pooled arrays are grouped by ticker, so order them by date first;
        # otherwise the "last 20%" would be whole tickers, not the latest period.
        order = np.argsort(dates_all, kind="stable")
        X_sorted, y_sorted = X_all_scaled[order], y_all[order]
        cut = int(0.8 * N)
        splits = [(X_sorted[:cut], y_sorted[:cut], X_sorted[cut:], y_sorted[cut:])]

    fold_losses = []
    n_folds = len(splits)
    print(f"Trial {trial.number}: Starting evaluation over {len(splits)} fold(s)...")
    for fold_index, (X_train_fold, y_train_fold, X_test_fold, y_test_fold) in enumerate(splits, start=1):
        print(f"Trial {trial.number}: Processing fold {fold_index}/{len(splits)}...")
        # Many models are built over the course of a study; release the Keras
        # state of the previous one so memory does not grow without bound.
        tf.keras.backend.clear_session()
        model = build_model_hp(params)
        # NOTE(review): early stopping monitors the same fold that is scored
        # below, which makes the reported loss optimistic. A validation slice
        # taken from the end of the training fold would avoid that.
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_test_fold, y_test_fold),
            callbacks=[early_stop],
            verbose=0
        )
        loss, _ = model.evaluate(X_test_fold, y_test_fold, verbose=0)
        print(f"Trial {trial.number}, Fold {fold_index}: Loss = {loss:.4f}")
        fold_losses.append(loss)

        # A pruner only sees values passed to trial.report(); the step is the
        # fold number. Pruning is skipped after the final fold because the
        # trial is then already fully evaluated.
        trial.report(float(np.mean(fold_losses)), step=fold_index)
        if fold_index < n_folds and trial.should_prune():
            raise optuna.TrialPruned()

    mean_loss = np.mean(fold_losses)
    print(f"Trial {trial.number}: Mean Loss = {mean_loss:.4f}")
    return mean_loss

In [None]:
# -------------------------------------------------------
# PART 7: Run Hyperparameter Optimization Study with Progress Bar
# -------------------------------------------------------
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# (TPE is Optuna's default sampler; model training itself is still unseeded).
sampler = optuna.samplers.TPESampler(seed=42)
# MedianPruner only acts on intermediate values passed to trial.report().
# A trial has only a handful of steps when steps are folds (the recorded run
# evaluates 4 folds per trial), so a 5-step warm-up is never cleared; use 1.
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1)
study = optuna.create_study(direction='minimize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best hyperparameters:", study.best_params)

[I 2025-03-31 22:15:42,451] A new study created in memory with name: no-name-ec6357cf-a3f0-42c1-886c-de76a1d2c26d


  0%|          | 0/50 [00:00<?, ?it/s]

Trial 0: Starting evaluation over 4 fold(s)...
Trial 0: Processing fold 1/4...


In [None]:
# -------------------------------------------------------
# PART 8: Rebuild Final Model and Train on Full Training Data
# -------------------------------------------------------

# (Option 1) Chronological hold-out: order all samples by date, train on the
# first 80% and keep the remaining 20% as the final test set.
sorted_idx = np.argsort(dates_all)
X_all_sorted, y_all_sorted = X_all_scaled[sorted_idx], y_all[sorted_idx]
N = X_all_sorted.shape[0]
train_end = int(0.8 * N)
X_train_final, X_test_final = X_all_sorted[:train_end], X_all_sorted[train_end:]
y_train_final, y_test_final = y_all_sorted[:train_end], y_all_sorted[train_end:]

# Rebuild the model with the best hyperparameters found by the Optuna study
final_params = study.best_params
final_model = build_model_hp(final_params)

# Stop once val_loss has not improved for 15 epochs and keep the best weights
final_early_stop = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)

history = final_model.fit(
    X_train_final,
    y_train_final,
    validation_split=0.2,
    batch_size=16,
    epochs=100,
    callbacks=[final_early_stop],
    verbose=1,
)

# Score the trained model on the held-out final test set
loss, mae = final_model.evaluate(X_test_final, y_test_final)
print("Final Test Loss:", loss)
print("Final Test MAE:", mae)