IMPORTING LIBRARIES

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, Bidirectional, LSTM,
                                     Dropout, Dense, Add, GlobalAveragePooling1D, Attention)
from tensorflow.keras.models import Model

DATA PREPARATION

In [None]:
def prepare_and_scale_data(filepath):
    """
    Read an indicator CSV, label every row, and return scaled chronological splits.

    Steps:
      1. Load the CSV, parse the `date` column and use it as the index.
      2. Keep the 19 price / volume / indicator columns as float32 and drop
         rows that contain missing values.
      3. Label each row 1 when the next row's close is strictly greater than
         its own close, otherwise 0. The final row has no successor and is
         dropped.
      4. Split by position: first 80% train, next 10% validation, remainder test.
      5. Fit a MinMaxScaler on the training rows only and apply it to all
         three splits.

    Args:
        filepath: Path of the CSV file. It must contain `date` and every
            column named in `feature_columns` below.

    Returns:
        Tuple `(train_X, train_y, val_X, val_y, test_X, test_y, scaler)` where
        the `*_X` entries are scaled feature arrays, the `*_y` entries are the
        0/1 label arrays, and `scaler` is the fitted MinMaxScaler.
    """
    feature_columns = [
        'open', 'high', 'low', 'close', 'volume', 'sma5', 'sma20', 'ema5', 'ema20',
        'upperband', 'middleband', 'lowerband', 'macd1226', 'MOM10', 'RSI14',
        'slowk', 'slowd', 'WILLR', 'ATR'
    ]

    raw = pd.read_csv(filepath)
    raw['date'] = pd.to_datetime(raw['date'])
    frame = raw.set_index('date')[feature_columns].astype('float32').dropna()

    # Direction label: does the following row close higher than this one?
    next_close = frame['close'].shift(-1)
    labelled = frame.assign(
        TomorrowClose=next_close,
        Target=(next_close > frame['close']).astype(int),
    ).dropna()  # removes the last row, whose next close is undefined

    feature_frame = labelled[feature_columns]
    label_series = labelled['Target']

    # Positional (chronological) split boundaries: 80% / 10% / remainder.
    n_rows = len(feature_frame)
    train_end = int(n_rows * 0.8)
    val_end = train_end + int(n_rows * 0.1)

    train_slice = slice(None, train_end)
    val_slice = slice(train_end, val_end)
    test_slice = slice(val_end, None)

    # The scaler only ever sees training rows when it is fitted.
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(feature_frame.iloc[train_slice])
    val_scaled = scaler.transform(feature_frame.iloc[val_slice])
    test_scaled = scaler.transform(feature_frame.iloc[test_slice])

    return (train_scaled, label_series.iloc[train_slice].values,
            val_scaled, label_series.iloc[val_slice].values,
            test_scaled, label_series.iloc[test_slice].values,
            scaler)

DATA PIPELINE: WINDOWED MODEL INPUT

In [None]:
def create_tf_dataset(features, targets, sequence_length, batch_size):
    """
    Build a batched, prefetched tf.data pipeline of sliding windows.

    Every sample is a window of `sequence_length` consecutive rows of
    `features`, labelled with the target that belongs to the window's LAST row.

    Keras pairs the window that starts at row i with `targets[i]`. If the
    targets were passed unshifted, a window would be labelled with the outcome
    of its first row; for a "what happens after this row" label that outcome
    is already contained in the window itself, so the model could read the
    answer from its input. Dropping the first `sequence_length - 1` targets
    re-aligns each label with the final row of its window.

    Args:
        features: Array-like of shape (n_rows, n_features), in time order.
        targets: Array-like with one label per row of `features`, or None to
            build an unlabelled dataset.
        sequence_length: Number of consecutive rows per window (>= 1).
        batch_size: Number of windows per batch.

    Returns:
        A tf.data.Dataset of window batches (with label batches when `targets`
        is given), in chronological order and prefetched with AUTOTUNE.

    Raises:
        ValueError: If `sequence_length` is smaller than 1, or if `features`
            has fewer rows than one full window.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
    if len(features) < sequence_length:
        # Without this check Keras silently returns an empty dataset, which
        # only fails later (and obscurely) inside fit()/predict().
        raise ValueError(
            f"Need at least {sequence_length} rows to build one window, "
            f"got {len(features)}"
        )

    # Label of window [i, i + sequence_length) = target of row i + sequence_length - 1.
    window_targets = None if targets is None else targets[sequence_length - 1:]

    dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=features,
        targets=window_targets,
        sequence_length=sequence_length,
        sequence_stride=1,
        sampling_rate=1,
        batch_size=batch_size,
        shuffle=False
    )
    return dataset.prefetch(tf.data.AUTOTUNE)

BUILDING THE CNN-BiLSTM-Attention MODEL

In [None]:
def build_efficient_model(input_shape, dropout=0.3):
    """
    Assemble the (uncompiled) CNN -> BiLSTM -> self-attention binary classifier.

    Layer order:
      Conv1D(64 filters, kernel 7, same padding, ReLU) -> MaxPooling1D(2) -> Dropout
      -> Bidirectional LSTM(128 units, full sequence output)
      -> Attention with the LSTM output used as both query and value
      -> GlobalAveragePooling1D -> Dense(64, ReLU) -> Dropout -> Dense(1, sigmoid)

    Args:
        input_shape: Shape of one sample, passed straight to `Input(shape=...)`.
        dropout: Rate used by both Dropout layers.

    Returns:
        A Keras `Model` mapping an input window to a single sigmoid output.
        The caller is responsible for compiling it.
    """
    model_input = Input(shape=input_shape)

    # Convolutional front-end: local pattern extraction, then pooling over time.
    conv_features = Conv1D(filters=64, kernel_size=7, padding="same", activation="relu")(model_input)
    pooled_features = MaxPooling1D(pool_size=2)(conv_features)
    regularised_features = Dropout(dropout)(pooled_features)

    # Recurrent encoder; return_sequences=True keeps one output per time step
    # so the attention layer has a sequence to attend over.
    encoded_sequence = Bidirectional(LSTM(units=128, return_sequences=True))(regularised_features)

    # Self-attention: the same tensor is supplied as query and as value.
    attended_sequence = Attention()([encoded_sequence, encoded_sequence])
    context_vector = GlobalAveragePooling1D()(attended_sequence)

    # Classification head.
    hidden = Dense(64, activation="relu")(context_vector)
    hidden = Dropout(dropout)(hidden)
    probability = Dense(1, activation="sigmoid")(hidden)

    return Model(inputs=model_input, outputs=probability)

EXECUTION

In [None]:

# --- 4. MAIN EXECUTION ---
# End-to-end run: load and split the data, build the windowed datasets, train
# the model with checkpointing, persist the scaler, then score the best
# checkpoint on the held-out test split.
if __name__ == '__main__':
    # Configuration
    # The dataset lives at a machine-specific location, so it can be overridden
    # through the STOCK_DATA_CSV environment variable; the original path is
    # kept as the default.
    FILEPATH = os.environ.get(
        'STOCK_DATA_CSV',
        '/Users/rajataggarwal/Downloads/NIFTY100_5mindata/DMART_with_indicators_.csv'
    )
    # Rows per input window. Presumably 75 five-minute bars per session x 5
    # sessions -- TODO confirm against the data source.
    SEQUENCE_LENGTH = 75 * 5
    BATCH_SIZE = 64
    MODEL_SAVE_PATH = 'best_stock_predictor.keras'
    SCALER_SAVE_PATH = 'data_scaler.joblib'
    SEED = 42

    # Seed Python, NumPy and TensorFlow so weight initialisation and dropout
    # start from the same random state on every run.
    tf.keras.utils.set_random_seed(SEED)

    # Load and process data
    print("Loading, scaling, and splitting data...")
    train_X, train_y, val_X, val_y, test_X, test_y, scaler = prepare_and_scale_data(FILEPATH)

    # Create tf.data pipelines (one per split, so no window spans two splits)
    train_dataset = create_tf_dataset(train_X, train_y, SEQUENCE_LENGTH, BATCH_SIZE)
    val_dataset = create_tf_dataset(val_X, val_y, SEQUENCE_LENGTH, BATCH_SIZE)
    test_dataset = create_tf_dataset(test_X, test_y, SEQUENCE_LENGTH, BATCH_SIZE)
    print("TF.data pipelines created.")

    # Build the model
    input_shape = (SEQUENCE_LENGTH, train_X.shape[1])
    model = build_efficient_model(input_shape)
    # Binary cross-entropy: the task is a two-class (up / not up) classification.
    model.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
    model.summary()

    # Training callbacks, all driven by validation loss:
    #   - stop after 10 epochs without improvement and restore the best weights
    #   - cut the learning rate to 20% after 5 stagnant epochs (floor 1e-6)
    #   - keep only the best model on disk for the final evaluation
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6),
        ModelCheckpoint(filepath=MODEL_SAVE_PATH, monitor='val_loss', save_best_only=True)
    ]

    # Train the model
    print("\nStarting model training...")
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=50,
        callbacks=callbacks
    )
    print("Training finished.")

    # Persist the fitted scaler so new data can be scaled exactly like the
    # training data at prediction time.
    print(f"Saving the data scaler to '{SCALER_SAVE_PATH}'...")
    joblib.dump(scaler, SCALER_SAVE_PATH)
    print("Scaler saved.")

    # Evaluation on the held-out test split
    print("\nEvaluating model on the unseen test set...")

    # Load the best model saved by ModelCheckpoint for final evaluation
    best_model = tf.keras.models.load_model(MODEL_SAVE_PATH)

    # Labels in the exact order the dataset yields them (it is not shuffled).
    y_test_aligned = np.concatenate([y for _, y in test_dataset], axis=0)

    # The network ends in Dense(1), so predict() returns shape (n, 1); flatten
    # it so probabilities, hard predictions and labels are all 1-D, and trim
    # once here instead of separately for each metric.
    y_pred_proba = best_model.predict(test_dataset).ravel()[:len(y_test_aligned)]
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate and print all relevant metrics
    accuracy = accuracy_score(y_test_aligned, y_pred)
    precision = precision_score(y_test_aligned, y_pred)
    recall = recall_score(y_test_aligned, y_pred)
    f1 = f1_score(y_test_aligned, y_pred)
    roc_auc = roc_auc_score(y_test_aligned, y_pred_proba)

    print("\n--- Final Model Performance ---")
    print(f"Test Accuracy: {accuracy:.4f}")
    # Accuracy: share of all predictions that were correct.

    print(f"\nTest Precision: {precision:.4f}")
    # Precision: of all 'UP' predictions, the share that were right (signal quality).

    print(f"\nTest Recall: {recall:.4f}")
    # Recall: of all actual 'UP' moves, the share the model caught (opportunity capture).

    print(f"\nTest F1-Score: {f1:.4f}")
    # F1: harmonic mean of precision and recall.

    print(f"\nTest ROC AUC: {roc_auc:.4f}")
    # ROC AUC: how well the probabilities separate 'UP' from 'DOWN' across all thresholds.
 

