In [1]:

import os
import time
import numpy as np
import pandas as pd
import ta  # Install via: pip install ta
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, LSTM, Dense, Concatenate, TimeDistributed,Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
# Desired date range
start_date_str = "02/01/2018"  # 2nd January 2003 (DD-MM-YYYY)
end_date_str   = "28/02/2025"  # 28th February 2025 (DD-MM-YYYY)
start_date_filter = pd.to_datetime(start_date_str, dayfirst=True)
end_date_filter   = pd.to_datetime(end_date_str, dayfirst=True)
raw_data_dir = "./data"           # Folder with raw CSV files (e.g., AAPL.csv, SPY.csv)
filtered_data_dir = "./filtered_data_lstmgru"

In [3]:


os.makedirs(filtered_data_dir, exist_ok=True)

def calculate_tema(series, window):
    """
    Calculate the Triple Exponential Moving Average (TEMA)
    using the formula: TEMA = 3*EMA1 - 3*EMA2 + EMA3
    where EMA1 = EMA(series, window)
          EMA2 = EMA(EMA1, window)
          EMA3 = EMA(EMA2, window)
    """
    ema1 = ta.trend.EMAIndicator(close=series, window=window, fillna=False).ema_indicator()
    ema2 = ta.trend.EMAIndicator(close=ema1, window=window, fillna=False).ema_indicator()
    ema3 = ta.trend.EMAIndicator(close=ema2, window=window, fillna=False).ema_indicator()
    tema = 3 * ema1 - 3 * ema2 + ema3
    return tema

def process_csv(file_path, filename):
    try:
        # Specific parsing based on filename
        if filename.upper() == "SPY.CSV":
            # SPY.csv is in DD-MM-YYYY format
            df = pd.read_csv(file_path, parse_dates=["Date"], dayfirst=True)
        else:
            # Other files are in YYYY-MM-DD format
            df = pd.read_csv(file_path, parse_dates=["Date"], dayfirst=False)

        # Drop rows where Date could not be parsed
        df.dropna(subset=["Date"], inplace=True)

        # Ensure Date column is datetime and remove timezone info
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce", utc=True).dt.tz_localize(None)

        # Verify we have the necessary columns
        required_columns = ["Close", "High", "Low", "Open", "Volume"]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Warning: {filename} is missing columns: {missing_columns}")
            return None

        # Print min and max dates for debugging
        print(f"{filename} - Date Range: {df['Date'].min()} to {df['Date'].max()}")

        # Filter for dates between start and end dates
        df_filtered = df[
            (df["Date"] >= start_date_filter) & 
            (df["Date"] <= end_date_filter)
        ].copy()
        
        if df_filtered.empty:
            print(f"Warning: {filename} has no data within the specified date range.")
            return None

        # Retain only essential columns
        df_filtered = df_filtered[["Date", "Open", "High", "Low", "Close", "Volume"]]
        
        # --- Calculate Standard Technical Indicators ---
        for window in [14, 26, 50, 100, 200]:
            df_filtered[f"SMA_{window}"] = ta.trend.SMAIndicator(close=df_filtered["Close"], window=window, fillna=False).sma_indicator()
            df_filtered[f"EMA_{window}"] = ta.trend.EMAIndicator(close=df_filtered["Close"], window=window, fillna=False).ema_indicator()
            df_filtered[f"TEMA_{window}"] = calculate_tema(df_filtered["Close"], window)
        
        # Bollinger Bands (20-day window, std dev=2)
        bb_indicator = ta.volatility.BollingerBands(close=df_filtered["Close"], window=20, window_dev=2, fillna=False)
        df_filtered["BB_Hband"] = bb_indicator.bollinger_hband()
        df_filtered["BB_Mband"] = bb_indicator.bollinger_mavg()
        df_filtered["BB_Lband"] = bb_indicator.bollinger_lband()
        
        # RSI (14-day)
        df_filtered["RSI_14"] = ta.momentum.RSIIndicator(close=df_filtered["Close"], window=14, fillna=False).rsi()
        
        # MACD: using default parameters (fast=12, slow=26, signal=9)
        macd_indicator = ta.trend.MACD(close=df_filtered["Close"], window_slow=26, window_fast=12, window_sign=9, fillna=False)
        df_filtered["MACD"] = macd_indicator.macd()
        df_filtered["MACD_Signal"] = macd_indicator.macd_signal()
        df_filtered["MACD_Hist"] = macd_indicator.macd_diff()
        
        # Derived feature: Mean_HL as the average of High and Low
        df_filtered["Mean_HL"] = (df_filtered["High"] + df_filtered["Low"]) / 2.0
        
        # --- Calculate Extra Relative & Trend-based Indicators ---
        # Relative Momentum (RMom) for a 14-day window
        df_filtered["RMom_14"] = df_filtered["Close"] / df_filtered["Close"].shift(14)
        
        # MomTEMA: ratio of current TEMA to its previous value (offset = 1 day)
        for window in [14, 26, 50, 100, 200]:
            df_filtered[f"MomTEMA_{window}_ofs1"] = df_filtered[f"TEMA_{window}"] / df_filtered[f"TEMA_{window}"].shift(1)
        
        # RCTEMA: ratio of current Close to TEMA for each window
        for window in [14, 26, 50, 100, 200]:
            df_filtered[f"RCTEMA_{window}"] = df_filtered["Close"] / df_filtered[f"TEMA_{window}"]
        
        # MomEMA: ratio of current EMA to its previous value (offset = 1 day)
        for window in [14, 26, 50, 100, 200]:
            df_filtered[f"MomEMA_{window}_ofs1"] = df_filtered[f"EMA_{window}"] / df_filtered[f"EMA_{window}"].shift(1)
        
        # Ratio Indicators for a chosen fast/slow pair (example: 14 vs. 50)
        df_filtered["RTEMA_TEMA_14_50"] = df_filtered["TEMA_14"] / df_filtered["TEMA_50"]
        df_filtered["REMA_EMA_14_50"] = df_filtered["EMA_14"] / df_filtered["EMA_50"]
        df_filtered["RSMA_SMA_14_50"] = df_filtered["SMA_14"] / df_filtered["SMA_50"]
        
        # Relative Volume to SMA: compares current volume to its 20-day SMA
        df_filtered["RVolSMA_20"] = df_filtered["Volume"] / df_filtered["Volume"].rolling(window=20).mean()
        
        # Drop rows with NaN values from indicator calculations
        df_filtered.dropna(inplace=True)
        
        # --- Convert all dates to ISO format (YYYY-MM-DD) ---
        df_filtered["Date"] = df_filtered["Date"].dt.strftime("%Y-%m-%d")
        
        return df_filtered
    
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return None

# Main processing loop
for filename in os.listdir(raw_data_dir):
    if filename.endswith(".csv"):
        file_path = os.path.join(raw_data_dir, filename)
        
        # Process the file
        processed_df = process_csv(file_path, filename)
        
        # Save if processing was successful
        if processed_df is not None:
            output_path = os.path.join(filtered_data_dir, filename)
            processed_df.to_csv(output_path, index=False)
            print(f"Processed {filename} and saved to {output_path}")

print("Processing complete.")


AAPL.csv - Date Range: 1980-12-12 05:00:00 to 2025-03-03 05:00:00
Processed AAPL.csv and saved to ./filtered_data_lstmgru\AAPL.csv
ABBV.csv - Date Range: 2013-01-02 05:00:00 to 2025-03-03 05:00:00
Processed ABBV.csv and saved to ./filtered_data_lstmgru\ABBV.csv
ADBE.csv - Date Range: 1986-08-13 04:00:00 to 2025-03-03 05:00:00
Processed ADBE.csv and saved to ./filtered_data_lstmgru\ADBE.csv
AMD.csv - Date Range: 1980-03-17 05:00:00 to 2025-03-03 05:00:00
Processed AMD.csv and saved to ./filtered_data_lstmgru\AMD.csv
AMT.csv - Date Range: 1998-02-27 05:00:00 to 2025-03-03 05:00:00
Processed AMT.csv and saved to ./filtered_data_lstmgru\AMT.csv
AMZN.csv - Date Range: 1997-05-15 04:00:00 to 2025-03-03 05:00:00
Processed AMZN.csv and saved to ./filtered_data_lstmgru\AMZN.csv
BA.csv - Date Range: 1962-01-02 05:00:00 to 2025-03-03 05:00:00
Processed BA.csv and saved to ./filtered_data_lstmgru\BA.csv
BAC.csv - Date Range: 1973-02-21 05:00:00 to 2025-03-03 05:00:00
Processed BAC.csv and saved to

In [3]:
# =======================================================
# Part 2: Load Filtered Data and Feature Engineering for Modeling
# =======================================================

def load_csv_data(filepath):
    """Load CSV file and parse the Date column appropriately."""
    df = pd.read_csv(filepath)
    # Use the correct format: ISO format (YYYY-MM-DD)
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df.sort_values("Date", inplace=True)
    return df



def get_model_features(df):
    """
    Return a DataFrame with a fixed set of features.
    Required features:
      - Base: Open, High, Low, Close, Volume
      - Indicators: RSI_14,
                    SMA_14, SMA_26, SMA_50, SMA_100, SMA_200,
                    EMA_14, EMA_26, EMA_50, EMA_100, EMA_200,
                    Bollinger Bands: BB_Hband, BB_Mband, BB_Lband,
                    Derived: Mean_HL
    If any feature is missing, fill with 0.
    """
    desired_features = [
        "Open", "High", "Low", "Close", "Volume",
        "RSI_14",
        "SMA_14", "SMA_26", "SMA_50", "SMA_100", "SMA_200",
        "EMA_14", "EMA_26", "EMA_50", "EMA_100", "EMA_200",
        "BB_Hband", "BB_Mband", "BB_Lband",
        "Mean_HL", "MACD", "MACD_Signal", "MACD_Hist",
        "RMom_14", "MomTEMA_14_ofs1", "MomTEMA_26_ofs1", "MomTEMA_50_ofs1", "MomTEMA_100_ofs1", "MomTEMA_200_ofs1",
        "RCTEMA_14", "RCTEMA_26", "RCTEMA_50", "RCTEMA_100", "RCTEMA_200",
        "MomEMA_14_ofs1", "MomEMA_26_ofs1", "MomEMA_50_ofs1", "MomEMA_100_ofs1", "MomEMA_200_ofs1",
        "RTEMA_TEMA_14_50", "REMA_EMA_14_50", "RSMA_SMA_14_50",
        "RVolSMA_20"
    ]
    for col in desired_features:
        if col not in df.columns:
            df[col] = 0.0
    return df[["Date"] + desired_features].copy()



In [4]:
# =======================================================
# Part 3: Sequence Generation and Global Scaling
# =======================================================
# We use a sliding window of 20 time steps.
sequence_length = 20

def prepare_sequences(df, seq_length):
    """
    Generate sequences from the DataFrame (sorted by Date) that contains the fixed set of features.
    Each sequence has shape (seq_length, num_features) and the target is the next day's Close price.
    Returns X, y, and seq_dates (as datetime64[ns]).
    """
    dates = df["Date"].values
    df_features = df.drop(columns=["Date"])
    data_array = df_features.values
    X, y, seq_dates = [], [], []
    target_index = df_features.columns.get_loc("Close")
    for i in range(seq_length, len(data_array)):
        X.append(data_array[i-seq_length:i])
        y.append(data_array[i, target_index])
        seq_dates.append(dates[i])
    # Ensure the dates are returned as datetime64[ns]
    seq_dates = np.array(seq_dates, dtype='datetime64[ns]')
    return np.array(X), np.array(y), seq_dates

X_list, y_list, dates_list, ticker_list = [], [], [], []
all_tickers = []

for filename in os.listdir(filtered_data_dir):
    if filename.endswith(".csv"):
        ticker = filename.split(".csv")[0]
        all_tickers.append(ticker)
        filepath = os.path.join(filtered_data_dir, filename)
        df_raw = load_csv_data(filepath)
        df_feat = get_model_features(df_raw)
        if len(df_feat) > sequence_length:
            X, y, seq_dates = prepare_sequences(df_feat, sequence_length)
            X_list.append(X)
            y_list.append(y)
            dates_list.append(seq_dates)
            ticker_list.extend([ticker] * len(y))

# Concatenate sequences from all tickers.
X_all = np.concatenate(X_list, axis=0)
y_all = np.concatenate(y_list, axis=0)
dates_all = np.concatenate(dates_list, axis=0)  # Now with dtype datetime64[ns]
num_features = X_all.shape[2]

# Fit a global RobustScaler on all feature data.
scaler = RobustScaler()
all_data = X_all.reshape(-1, num_features)
scaler.fit(all_data)

def scale_sequences(X, scaler):
    """Scale each sequence using the fitted scaler."""
    return np.array([scaler.transform(seq) for seq in X])

X_all_scaled = scale_sequences(X_all, scaler)



In [5]:


# =======================================================
# Assumed Preprocessing (should be executed before this snippet)
# =======================================================
# For example:
# X_all_scaled = ...  # shape: (n_samples, sequence_length, num_features)
# y_all = ...         # shape: (n_samples,)
# sequence_length = 20
# num_features = X_all_scaled.shape[2]

# =======================================================
# Part 4: Train-Test Split (Random 80/20 Split)
# =======================================================
X_train, X_test, y_train, y_test = train_test_split(
    X_all_scaled, y_all, test_size=0.1, random_state=42, shuffle=True
)
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

# =======================================================
# Setup Distribution Strategy for Parallel Processing
# =======================================================
strategy = tf.distribute.MirroredStrategy()
print("Number of devices: ", strategy.num_replicas_in_sync)

# =======================================================
# Custom Callback for Epoch Timing and Logging Analytics
# =======================================================
class TimeHistory(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        self.epoch_times = []
        self.train_start_time = time.time()
        print("Training started...")
    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_start_time = time.time()
    def on_epoch_end(self, epoch, logs=None):
        epoch_time = time.time() - self.epoch_start_time
        self.epoch_times.append(epoch_time)
        print(f"Epoch {epoch+1} finished in {epoch_time:.2f} seconds. "
              f"Loss: {logs.get('loss'):.4f}, Val Loss: {logs.get('val_loss'):.4f}")
    def on_train_end(self, logs=None):
        total_time = time.time() - self.train_start_time
        print(f"Training completed in {total_time:.2f} seconds over {len(self.epoch_times)} epochs.")
        avg_epoch_time = np.mean(self.epoch_times)
        print(f"Average time per epoch: {avg_epoch_time:.2f} seconds.")

time_callback = TimeHistory()

Training samples: 35076
Testing samples: 3898
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of devices:  1


In [6]:
from keras_tuner import HyperParameters, RandomSearch
import keras_tuner as kt

def build_model(hp):
    sequence_length = 20
    num_features = 43

    inputs = Input(shape=(sequence_length, num_features))
    
    # Tune the Dense layer size
    dense_units = hp.Choice("dense_units", [32, 64, 128])
    x = TimeDistributed(Dense(dense_units, activation='selu'))(inputs)
    
    # Tune number of GRU units
    short_term_steps = hp.Int("short_term_steps", min_value=3, max_value=10, step=1)
    short_term = Lambda(lambda x: x[:, -short_term_steps:, :])(x)

    short_units_1 = hp.Choice("short_units_1", [32, 64, 128])
    short_units_2 = hp.Choice("short_units_2", [16, 32, 64])
    short_branch = GRU(short_units_1, return_sequences=True, recurrent_dropout=0.2)(short_term)
    short_branch = GRU(short_units_2, recurrent_dropout=0.1)(short_branch)

    # Tune number of LSTM units in long-term branch
    long_units_1 = hp.Choice("long_units_1", [64, 128])
    long_units_2 = hp.Choice("long_units_2", [32, 64])
    long_units_3 = hp.Choice("long_units_3", [16, 32])
    long_branch = LSTM(long_units_1, return_sequences=True, recurrent_dropout=0.2)(x)
    long_branch = LSTM(long_units_2, return_sequences=True, recurrent_dropout=0.1)(long_branch)
    long_branch = LSTM(long_units_3, recurrent_dropout=0.1)(long_branch)

    # Merge
    merged = Concatenate()([short_branch, long_branch])

    dense_final = hp.Choice("dense_final", [16, 32, 64])
    dense_out = Dense(dense_final, activation='selu')(merged)
    outputs = Dense(1)(dense_out)

    model = Model(inputs=inputs, outputs=outputs)

    # Learning rate tuning
    learning_rate = hp.Choice("lr", [1e-2, 1e-3, 5e-4, 1e-4])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=Huber(delta=1.5),
        metrics=["mae"]
    )
    
    return model


In [7]:
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=20,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='lstm_gru_forecast'
)





In [None]:
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

tuner.search(
    X_train, y_train,
    epochs=100,
    validation_split=0.2,
    batch_size=16,
    callbacks=[early_stop]
)



Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
128               |128               |dense_units
9                 |9                 |short_term_steps
128               |128               |short_units_1
64                |64                |short_units_2
128               |128               |long_units_1
64                |64                |long_units_2
32                |32                |long_units_3
32                |32                |dense_final
0.0005            |0.0005            |lr

Epoch 1/100
[1m 147/1754[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3:29[0m 130ms/step - loss: 267.2347 - mae: 178.9048