# In this notebook we will train the LSTM model

In [None]:
# Column names gathered under ``drop_seq_4h``. The name suggests these are
# the columns dropped for sequence models on 4h data -- confirm against usage.
drop_seq_4h = [
    # Raw prices and price-derived spreads
    'open', 'high', 'low',
    'high_low', 'high_close', 'low_close', 'typical_price',
    # Indicator lines
    'bollinger_upper', 'bollinger_lower',
    'MACD_line', 'MACD_signal', 'stoch_%D',
    'EMA_21', 'SMA_20',
    # Scenario flags: bullish 1-6, bearish 1-4 and 6 (no bearish_scenario_5)
    *(f'bullish_scenario_{k}' for k in range(1, 7)),
    *(f'bearish_scenario_{k}' for k in (1, 2, 3, 4, 6)),
    # Breakout / band-break flags
    'volume_breakout', 'volume_breakdown',
    'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x',
    # Overbought / oversold flags
    'rsi_oversold', 'rsi_overbought',
    'stoch_overbought', 'stoch_oversold',
    'cci_overbought', 'cci_oversold',
    'near_upper_band', 'near_lower_band',
    'overbought_reversal', 'oversold_reversal',
    # Moving-average position and crossover flags
    'above_sma20', 'above_sma50', 'ema7_above_ema21',
    'ema_cross_up', 'ema_cross_down',
    'macd_cross_up', 'macd_cross_down',
    # Momentum / trend flags
    'macd_positive', 'momentum_alignment', 'macd_rising', 'obv_rising_24h',
    'trending_market', 'trend_alignment',
    # Level / regime columns
    'support_level', 'resistance_level', 'volatility_regime',
    # Daily-suffixed columns
    'close_daily', 'rsi_daily',
]

✅ DROP_COLS['4H']['LSTM']
✅ DROP_COLS['4H']['GRU']
✅ DROP_COLS['4H']['CNN']
✅ DROP_COLS['4H']['CNN_LSTM']
✅ DROP_COLS['4H']['TCN']


In [None]:
# Drop columns for CatBoost
# Column names gathered under ``drop_catboost_4h``. The name suggests these
# are the columns dropped for CatBoost on 4h data -- confirm against usage.
drop_catboost_4h = [
    # Raw prices and price-derived spreads
    'open', 'high', 'low',
    'high_low', 'high_close', 'low_close', 'typical_price',
    # Indicator lines
    'EMA_21', 'SMA_20',
    'bollinger_upper', 'bollinger_lower',
    'MACD_line', 'MACD_signal', 'stoch_%D',
    # Scenario flags: bullish 1-5, bearish 1-3 and 6
    *(f'bullish_scenario_{k}' for k in range(1, 6)),
    *(f'bearish_scenario_{k}' for k in (1, 2, 3, 6)),
    # Breakout / band flags
    'volume_breakout', 'volume_breakdown',
    'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'near_upper_band', 'near_lower_band',
    # Overbought / oversold flags
    'rsi_oversold', 'rsi_overbought',
    'stoch_overbought', 'stoch_oversold',
    'cci_overbought', 'cci_oversold',
    'overbought_reversal', 'oversold_reversal',
    # Crossover flags
    'ema_cross_up', 'ema_cross_down',
    'macd_cross_up', 'macd_cross_down',
    # Trend / momentum flags
    'trending_market', 'trend_alignment', 'momentum_alignment',
    'ema7_above_ema21', 'obv_rising_24h', 'above_sma20', 'above_sma50',
    'macd_positive', 'macd_rising',
]


In [None]:
# XGBoost and LightGBM share the same drop-column list.

In [2]:
# =============================================================
#  IMPROVED LSTM HYPERPARAMETER TUNER (precision-weighted Fβ=0.5)
# =============================================================
import numpy as np
import pandas as pd
import time
import warnings
from pathlib import Path
import json
import gc
from datetime import datetime
import itertools

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.regularizers import l1_l2

# Sklearn for metrics and preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

# Silence all warnings and fix the NumPy / TensorFlow seeds to 42.
warnings.filterwarnings("ignore")
np.random.seed(42)
tf.random.set_seed(42)

# GPU setup: turn on memory growth for every visible GPU.
print("🔧 GPU Setup:")
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("⚠️ No GPU found, using CPU")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✅ Found {len(gpus)} GPU(s)")
    except RuntimeError as e:
        # Reported rather than re-raised so the script continues.
        print(f"❌ GPU setup error: {e}")

# ──────────────────────────────────────────────────────────────
# 1) CONFIG  – EDIT HERE
# ──────────────────────────────────────────────────────────────
# Input dataset and the names of its timestamp / label columns.
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COL = "timestamp"
TARGET_COL = "target"

# Rows before START_DATE are discarded; the splits are chronological.
START_DATE = "2016-01-01"
TEST_FRAC = 0.20  # final fraction of all rows held out as the test set
VAL_FRAC = 0.15   # final fraction of the remaining (non-test) rows used for validation

# Sequence parameters
SEQUENCE_LENGTH = 24  # 24 * 4h = 4 days of history

# Feature columns removed before training (missing names are ignored on drop).
DROP_COLS = ['open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
            'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
            'vol_spike_1_5x', 'near_upper_band', 'near_lower_band',
            'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
            'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
            'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment', 'obv_rising_24h',
            'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1']

# Bounded columns that should use MinMax scaling (0-1 range)
BOUNDED_COLS = ['rsi', 'stoch_%K', 'bb_position', 'williams_%R']  # Add your bounded features

# Hyperparameter search space. Every key is read by the search code below:
# model architecture, optimizer, scaler choice and decision threshold.
PARAM_GRID = {
    'lstm_units': [32, 64, 128],
    'num_layers': [1, 2, 3],
    'dropout_rate': [0.2, 0.3, 0.5],
    'learning_rate': [0.001, 0.0001, 0.00001],
    'batch_size': [32, 64, 128],
    'optimizer': ['adam', 'rmsprop'],
    'scaler_type': ['standard', 'robust', 'minmax', 'mixed'],  # Added minmax and mixed
    'l1_reg': [0.0, 0.001, 0.01],
    'l2_reg': [0.0, 0.001, 0.01],
    'threshold': [0.3, 0.4, 0.5, 0.6, 0.7]  # Added threshold as hyperparameter
}

# Search strategy
MAX_TRIALS = 50
EARLY_STOPPING_PATIENCE = 8  # Reduced for faster trials
REDUCE_LR_PATIENCE = 4
MAX_EPOCHS = 80  # Reduced for faster search

# Results save path
RESULTS_PATH = Path("results/lstm_hyperparameter_results.json")
# parents=True so editing RESULTS_PATH to a nested directory does not make
# this line fail with FileNotFoundError.
RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)

# ──────────────────────────────────────────────────────────────
# 2) IMPROVED DATA PREPROCESSING
# ──────────────────────────────────────────────────────────────
def load_and_prepare_data():
    """Load the CSV dataset and split it into a feature frame and target series.

    Reads ``CSV_FILE``, indexes it by ``TIME_COL`` in ascending order, keeps
    rows from ``START_DATE`` onward, and removes ``DROP_COLS`` plus the target
    column from the features. Missing feature values are forward-filled and
    any remaining leading gaps are back-filled.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the feature DataFrame and ``y`` is the
        ``TARGET_COL`` Series, both indexed by timestamp.

    Raises:
        FileNotFoundError: if ``CSV_FILE`` does not exist.
        KeyError: if ``TARGET_COL`` is not a column of the loaded data.
    """
    if not CSV_FILE.exists():
        raise FileNotFoundError(f"❌ File not found: {CSV_FILE}")

    print("📊 Loading data...")
    df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COL])
    df = df.set_index(TIME_COL).sort_index()
    df = df.loc[START_DATE:].copy()

    if TARGET_COL not in df.columns:
        raise KeyError(f"❌ '{TARGET_COL}' column missing!")

    # errors="ignore" already skips names absent from the frame, so the drop
    # list does not need to be pre-filtered.
    X = df.drop(columns=[*DROP_COLS, TARGET_COL], errors="ignore")
    y = df[TARGET_COL]

    print(f"Dataset shape: {X.shape}")
    print(f"Features: {len(X.columns)}")
    print(f"Target distribution: {y.value_counts().to_dict()}")

    # Check for missing values
    if X.isnull().values.any():
        print("⚠️ Warning: Missing values detected - forward filling")
        # fillna(method=...) is deprecated in pandas 2.x; ffill()/bfill() are
        # the supported equivalents. bfill only touches gaps at the very start
        # that forward-filling cannot reach.
        X = X.ffill().bfill()

    return X, y

def create_sequences(X, y, sequence_length):
    """Build sliding windows of features paired with the following target.

    Sample ``k`` holds rows ``k .. k + sequence_length - 1`` of ``X`` and is
    labelled with ``y`` at row ``k + sequence_length`` (the row right after
    the window).

    Args:
        X: 2-D features, DataFrame or array-like, one row per time step.
        y: 1-D targets, Series or array-like, aligned row-for-row with ``X``.
        sequence_length: number of consecutive rows per window (>= 1).

    Returns:
        tuple: ``(X_seq, y_seq)`` with shapes
        ``(n_samples, sequence_length, n_features)`` and ``(n_samples,)``,
        where ``n_samples = max(len(X) - sequence_length, 0)``. When the input
        is too short the arrays are empty but keep those ranks, so callers can
        still read ``X_seq.shape[1:]``.

    Raises:
        ValueError: if ``sequence_length`` < 1 or ``y`` has fewer rows than ``X``.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    # Convert once up front; slicing a DataFrame with .iloc for every window
    # is far slower than slicing a NumPy array.
    features = np.asarray(X)
    targets = np.asarray(y)
    n_rows = len(features)

    if len(targets) < n_rows:
        raise ValueError(
            f"y has {len(targets)} rows but X has {n_rows}; they must be aligned"
        )

    if n_rows <= sequence_length:
        empty_X = np.empty((0, sequence_length) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X_seq = np.stack(
        [features[end - sequence_length:end] for end in range(sequence_length, n_rows)]
    )
    y_seq = targets[sequence_length:n_rows].copy()

    return X_seq, y_seq

def get_mixed_scaler(X_train, bounded_cols):
    """Create an unfitted scaler: MinMax for bounded columns, Standard for the rest.

    Args:
        X_train: accepted for call-site compatibility; not used here. The
            returned scaler must still be fitted with ``.fit(...)``.
        bounded_cols: column names to scale with ``MinMaxScaler``; every other
            column goes through ``StandardScaler``.

    Returns:
        An object exposing ``fit(X)``, ``transform(X)`` and ``fit_transform(X)``.
    """
    class MixedScaler:
        """Column-wise scaler that routes columns by name.

        For inputs without a ``columns`` attribute, positional indices are
        used as the names, so such inputs only match integer entries of
        ``bounded_cols``.
        """

        def __init__(self, bounded_cols):
            self.bounded_cols = bounded_cols
            self.standard_scaler = StandardScaler()
            self.minmax_scaler = MinMaxScaler()
            self.feature_names = None
            # Boolean mask over columns (True = bounded), set by fit().
            self._bounded_mask = None

        @staticmethod
        def _select(X, mask):
            """Return the columns of ``X`` selected by a boolean mask."""
            return X.iloc[:, mask] if hasattr(X, 'iloc') else X[:, mask]

        def fit(self, X):
            """Fit both sub-scalers on their respective columns of ``X``."""
            self.feature_names = X.columns if hasattr(X, 'columns') else range(X.shape[1])
            # Computed once here and reused by transform().
            self._bounded_mask = np.array(
                [col in self.bounded_cols for col in self.feature_names], dtype=bool
            )

            if self._bounded_mask.any():
                self.minmax_scaler.fit(self._select(X, self._bounded_mask))
            if (~self._bounded_mask).any():
                self.standard_scaler.fit(self._select(X, ~self._bounded_mask))
            return self

        def transform(self, X):
            """Scale ``X`` and return a float ndarray of the same shape."""
            if self._bounded_mask is None:
                raise RuntimeError("MixedScaler.transform() called before fit()")

            # Always allocate a float buffer: np.zeros_like(X) would inherit
            # an integer input dtype and truncate the scaled values.
            result = np.zeros(X.shape, dtype=np.float64)

            if self._bounded_mask.any():
                result[:, self._bounded_mask] = self.minmax_scaler.transform(
                    self._select(X, self._bounded_mask)
                )
            if (~self._bounded_mask).any():
                result[:, ~self._bounded_mask] = self.standard_scaler.transform(
                    self._select(X, ~self._bounded_mask)
                )
            return result

        def fit_transform(self, X):
            """Fit on ``X`` and return its scaled version."""
            return self.fit(X).transform(X)

    return MixedScaler(bounded_cols)

def prepare_lstm_data(X, y, sequence_length, test_frac, val_frac, scaler_type='standard'):
    """Split chronologically, scale with train-only statistics, and window the data.

    The last ``test_frac`` of the rows becomes the test set; of the remaining
    rows, the last ``val_frac`` becomes the validation set. The scaler is
    fitted on the training rows only and then applied to all three splits.
    Each split is windowed independently, so the first ``sequence_length``
    rows of every split serve only as context and produce no sample.

    Args:
        X: feature DataFrame in chronological order.
        y: target Series aligned with ``X``.
        sequence_length: rows per window.
        test_frac: fraction of all rows held out for testing (0 < f < 1).
        val_frac: fraction of the non-test rows held out for validation (0 < f < 1).
        scaler_type: one of ``'standard'``, ``'robust'``, ``'minmax'``, ``'mixed'``.

    Returns:
        tuple: ``((X_train, y_train), (X_val, y_val), (X_test, y_test), scaler)``
        where the X arrays are 3-D windows and ``scaler`` is the fitted scaler.

    Raises:
        ValueError: on an unknown ``scaler_type`` or an out-of-range fraction.
    """
    valid_scalers = ('standard', 'robust', 'minmax', 'mixed')
    if scaler_type not in valid_scalers:
        # Previously any unrecognised value silently selected the mixed scaler,
        # which would hide a typo in the search grid.
        raise ValueError(
            f"Unknown scaler_type {scaler_type!r}; expected one of {valid_scalers}"
        )
    for frac_name, frac in (('test_frac', test_frac), ('val_frac', val_frac)):
        if not 0 < frac < 1:
            raise ValueError(f"{frac_name} must be in (0, 1), got {frac}")

    # Time-based splits
    test_split = int(len(X) * (1 - test_frac))
    train_val_split = int(test_split * (1 - val_frac))

    X_train, y_train = X.iloc[:train_val_split], y.iloc[:train_val_split]
    X_val, y_val = X.iloc[train_val_split:test_split], y.iloc[train_val_split:test_split]
    X_test, y_test = X.iloc[test_split:], y.iloc[test_split:]

    # Choose scaler
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'robust':
        scaler = RobustScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:  # 'mixed' (validated above)
        scaler = get_mixed_scaler(X_train, BOUNDED_COLS)

    # Fit ONLY on training data so no validation/test statistics leak in.
    scaler.fit(X_train)

    def _scale(frame):
        """Apply the fitted scaler, keeping the frame's columns and index."""
        return pd.DataFrame(
            scaler.transform(frame),
            columns=frame.columns,
            index=frame.index
        )

    # Create sequences
    X_train_seq, y_train_seq = create_sequences(_scale(X_train), y_train, sequence_length)
    X_val_seq, y_val_seq = create_sequences(_scale(X_val), y_val, sequence_length)
    X_test_seq, y_test_seq = create_sequences(_scale(X_test), y_test, sequence_length)

    return (X_train_seq, y_train_seq), (X_val_seq, y_val_seq), (X_test_seq, y_test_seq), scaler

# ──────────────────────────────────────────────────────────────
# 3) IMPROVED MODEL AND CALLBACKS
# ──────────────────────────────────────────────────────────────
def f_beta_score(y_true, y_pred, beta=0.5, threshold=0.5):
    """Return the F-beta score of thresholded predictions.

    ``y_pred`` is binarised with ``y_pred > threshold`` (strictly greater),
    then precision and recall are combined as
    ``(1 + beta^2) * P * R / (beta^2 * P + R)``. The result is 0.0 when both
    precision and recall are zero.
    """
    hard_labels = (y_pred > threshold).astype(int)
    precision = precision_score(y_true, hard_labels, zero_division=0)
    recall = recall_score(y_true, hard_labels, zero_division=0)

    if precision + recall != 0:
        return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
    return 0.0

class FBetaEarlyStopping(Callback):
    """Stop training when the validation F-beta score stops improving.

    After every epoch the model predicts on ``validation_data`` and the
    result is scored with ``f_beta_score`` at ``threshold``. An epoch counts
    as an improvement only if it beats the best score so far by more than
    ``min_delta``. After ``patience`` consecutive non-improving epochs
    training is stopped. When training ends for any reason, the weights of
    the best epoch are restored if one was recorded.

    Args:
        validation_data: ``(X_val, y_val)`` tuple used for scoring.
        threshold: probability cut-off passed to ``f_beta_score``.
        patience: non-improving epochs tolerated before stopping.
        min_delta: minimum gain over the best score to count as improvement.
    """
    def __init__(self, validation_data, threshold=0.5, patience=10, min_delta=0.001):
        super().__init__()
        self.validation_data = validation_data
        self.threshold = threshold
        self.patience = patience
        self.min_delta = min_delta
        self.wait = 0
        self.best_score = 0
        self.best_weights = None

    def on_train_begin(self, logs=None):
        # Reset so state from a previous fit() does not carry over if the
        # same instance is reused.
        self.wait = 0
        self.best_score = 0
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        X_val, y_val = self.validation_data
        y_pred = self.model.predict(X_val, verbose=0)
        current_score = f_beta_score(y_val, y_pred.flatten(), threshold=self.threshold)

        if current_score > self.best_score + self.min_delta:
            self.best_score = current_score
            self.best_weights = self.model.get_weights()
            self.wait = 0
        else:
            self.wait += 1

        if self.wait >= self.patience:
            # Restoration happens in on_train_end, which Keras calls after
            # stop_training takes effect.
            self.model.stop_training = True

    def on_train_end(self, logs=None):
        # best_weights stays None when no epoch ever beat min_delta (e.g. the
        # model predicts no positives at this threshold); calling
        # set_weights(None) in that case would raise and abort the fit.
        # Restoring here also covers runs that reach the epoch limit without
        # triggering patience.
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

def build_lstm_model(input_shape, params):
    """Assemble and compile the stacked-LSTM binary classifier.

    Reads ``num_layers``, ``lstm_units``, ``l1_reg``, ``l2_reg``,
    ``dropout_rate``, ``optimizer`` and ``learning_rate`` from ``params``.
    Layout: LSTM stack -> Dropout -> BatchNormalization -> Dense(32, relu)
    -> Dropout -> Dense(1, sigmoid), compiled with binary cross-entropy and
    an accuracy metric.
    """
    depth = params['num_layers']

    # One return_sequences flag per LSTM layer: every layer but the last in a
    # multi-layer stack emits the full sequence; a single layer emits only
    # its final state.
    if depth == 1:
        sequence_flags = [False]
    else:
        sequence_flags = [True] + [idx < depth - 1 for idx in range(1, depth)]

    model = Sequential()
    for position, emit_sequences in enumerate(sequence_flags):
        layer_kwargs = {
            'return_sequences': emit_sequences,
            'kernel_regularizer': l1_l2(l1=params['l1_reg'], l2=params['l2_reg']),
        }
        if position == 0:
            # Only the first layer declares the input shape.
            layer_kwargs['input_shape'] = input_shape
        model.add(LSTM(params['lstm_units'], **layer_kwargs))

    # Head: dropout and batch norm on the LSTM output, then a small dense block.
    drop_rate = params['dropout_rate']
    model.add(Dropout(drop_rate))
    model.add(BatchNormalization())
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(drop_rate))
    model.add(Dense(1, activation='sigmoid'))

    # Any optimizer value other than 'adam' selects RMSprop.
    optimizer_cls = Adam if params['optimizer'] == 'adam' else RMSprop
    model.compile(
        optimizer=optimizer_cls(learning_rate=params['learning_rate']),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Score ``model`` on ``(X_test, y_test)`` at the given probability cut-off.

    Returns a dict with accuracy, precision, recall, F-beta (beta=0.5) and
    ROC-AUC, plus the ``threshold`` that was applied. ROC-AUC is computed from
    the raw probabilities; the other metrics use ``probability > threshold``.
    """
    probabilities = model.predict(X_test, verbose=0)
    hard_labels = (probabilities > threshold).astype(int).flatten()

    return {
        'accuracy': accuracy_score(y_test, hard_labels),
        'precision': precision_score(y_test, hard_labels, zero_division=0),
        'recall': recall_score(y_test, hard_labels, zero_division=0),
        'f_beta_0_5': f_beta_score(y_test, probabilities.flatten(), threshold=threshold),
        'roc_auc': roc_auc_score(y_test, probabilities),
        'threshold': threshold
    }

# ──────────────────────────────────────────────────────────────
# 4) IMPROVED SEARCH STRATEGY
# ──────────────────────────────────────────────────────────────
def efficient_param_generator(param_grid, max_trials):
    """Yield hyperparameter dicts from a grid, sampling when the grid is large.

    If the grid has at most ``max_trials`` combinations, every combination is
    yielded in ``itertools.product`` order. Otherwise ``max_trials`` distinct
    combinations are drawn with ``np.random.choice`` (so results follow the
    global NumPy seed) and yielded in ascending grid order.

    Args:
        param_grid: mapping of parameter name -> collection of candidate values.
        max_trials: maximum number of combinations to yield.

    Yields:
        dict: one ``{name: value}`` mapping per selected combination.
    """
    param_names = list(param_grid.keys())
    # Materialise as lists so candidates can be looked up by position below.
    param_values = [list(param_grid[name]) for name in param_names]

    # Calculate total combinations
    total_combinations = int(np.prod([len(values) for values in param_values]))
    print(f"Total possible combinations: {total_combinations:,}")

    if total_combinations <= max_trials:
        # Generate all combinations if small enough
        for combination in itertools.product(*param_values):
            yield dict(zip(param_names, combination))
        return

    # Randomly sample distinct flat indices into the Cartesian product.
    sampled_indices = np.random.choice(total_combinations, max_trials, replace=False)
    sampled_indices.sort()

    # Decode each flat index directly instead of walking the whole product:
    # itertools.product varies the LAST parameter fastest, so peel digits off
    # from the last parameter backwards (mixed-radix decomposition).
    for flat_index in sampled_indices:
        remainder = int(flat_index)
        picks = []
        for values in reversed(param_values):
            remainder, position = divmod(remainder, len(values))
            picks.append(values[position])
        picks.reverse()
        yield dict(zip(param_names, picks))

def run_hyperparameter_search():
    """Run the hyperparameter search and collect per-trial results.

    For each parameter set from ``efficient_param_generator`` the data is
    re-prepared with that trial's scaler, a model is built and trained with
    F-beta early stopping, and validation/test metrics are recorded at the
    trial's threshold. A trial that raises is reported and skipped. The best
    trial is the one with the highest validation F-beta (beta=0.5).

    Returns:
        tuple: ``(results, best_params, best_score)``. ``results`` is a list
        of per-trial dicts. ``best_params`` is ``None`` and ``best_score`` is
        0 only if no trial completed.
    """
    print("🚀 Starting IMPROVED LSTM Hyperparameter Search")
    print("="*60)

    # Load and prepare data
    X, y = load_and_prepare_data()

    results = []
    best_score = 0
    best_params = None

    for trial, params in enumerate(efficient_param_generator(PARAM_GRID, MAX_TRIALS), start=1):
        print(f"\n📊 Trial {trial}/{MAX_TRIALS}")
        print(f"Parameters: {params}")

        # Pre-bind so the finally-block can always release them.
        model = history = None
        try:
            # Prepare data with current scaler
            (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler = prepare_lstm_data(
                X, y, SEQUENCE_LENGTH, TEST_FRAC, VAL_FRAC, params['scaler_type']
            )

            # Build model
            input_shape = (X_train.shape[1], X_train.shape[2])
            model = build_lstm_model(input_shape, params)

            # Callbacks - stop on F-beta instead of loss
            callbacks = [
                FBetaEarlyStopping(
                    validation_data=(X_val, y_val),
                    threshold=params['threshold'],
                    patience=EARLY_STOPPING_PATIENCE
                ),
                ReduceLROnPlateau(
                    monitor='val_accuracy',  # Changed from val_loss
                    factor=0.5,
                    patience=REDUCE_LR_PATIENCE,
                    min_lr=1e-7,
                    verbose=0
                )
            ]

            # Train model
            start_time = time.time()
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=MAX_EPOCHS,
                batch_size=params['batch_size'],
                callbacks=callbacks,
                verbose=0
            )
            training_time = time.time() - start_time

            # Evaluate model with custom threshold
            val_metrics = evaluate_model(model, X_val, y_val, params['threshold'])
            test_metrics = evaluate_model(model, X_test, y_test, params['threshold'])

            # Store results
            results.append({
                'trial': trial,
                'params': params,
                'val_metrics': val_metrics,
                'test_metrics': test_metrics,
                'training_time': training_time,
                'epochs_trained': len(history.history['loss']),
                'final_train_loss': float(history.history['loss'][-1]),
                'final_val_loss': float(history.history['val_loss'][-1])
            })

            # Track the best model. The first completed trial always becomes
            # the incumbent, so best_params is not left as None when every
            # trial scores exactly 0.
            current_score = val_metrics['f_beta_0_5']
            if best_params is None or current_score > best_score:
                best_score = current_score
                best_params = params
                print(f"🌟 New best F-beta score: {current_score:.4f}")

            print(f"Val F-beta: {val_metrics['f_beta_0_5']:.4f}, "
                  f"Test F-beta: {test_metrics['f_beta_0_5']:.4f}, "
                  f"Time: {training_time:.1f}s")

        except Exception as e:
            # Deliberate best-effort: one bad configuration must not abort
            # the whole search.
            print(f"❌ Trial failed: {str(e)}")
        finally:
            # Memory cleanup on both the success and the failure path.
            del model, history
            tf.keras.backend.clear_session()
            gc.collect()

    return results, best_params, best_score

# ──────────────────────────────────────────────────────────────
# 5) MAIN EXECUTION
# ──────────────────────────────────────────────────────────────
# Script entry point: run the search, print a summary with the top five
# models by validation F-beta, and write everything to RESULTS_PATH as JSON.
if __name__ == "__main__":
    start_time = time.time()

    print("🔍 IMPROVED LSTM Hyperparameter Tuning")
    print("="*60)
    total_combinations = np.prod([len(v) for v in PARAM_GRID.values()])
    print(f"Search space size: {total_combinations:,}")
    print(f"Max trials: {MAX_TRIALS}")
    print(f"Sequence length: {SEQUENCE_LENGTH}")
    print(f"Target metric: F-beta (β=0.5) with threshold optimization")
    print(f"Improvements: Mixed scaling, F-beta early stopping, threshold tuning")

    # Run hyperparameter search
    results, best_params, best_score = run_hyperparameter_search()

    total_time = time.time() - start_time

    # Display results
    print("\n" + "="*60)
    print("🏆 IMPROVED HYPERPARAMETER SEARCH RESULTS")
    print("="*60)

    if results:
        print(f"Completed trials: {len(results)}")
        print(f"Total time: {total_time/60:.1f} minutes")
        print(f"Average time per trial: {total_time/len(results):.1f} seconds")

        print(f"\n🌟 BEST PARAMETERS (F-beta = {best_score:.4f}):")
        # best_params can be None when trials completed but none scored
        # above the search's initial best of 0; iterating None would raise.
        if best_params is None:
            print("   (no trial scored above 0)")
        else:
            for param, value in best_params.items():
                print(f"   {param:<15}: {value}")

        # Sort results by validation F-beta score
        results_sorted = sorted(results, key=lambda x: x['val_metrics']['f_beta_0_5'], reverse=True)

        print(f"\n📊 TOP 5 MODELS:")
        print("Rank | Val F-beta | Test F-beta | Threshold | Time(s) | Scaler | Params")
        print("-" * 80)

        for i, result in enumerate(results_sorted[:5]):
            val_score = result['val_metrics']['f_beta_0_5']
            test_score = result['test_metrics']['f_beta_0_5']
            threshold = result['params']['threshold']
            time_taken = result['training_time']
            scaler = result['params']['scaler_type']
            params_str = f"units={result['params']['lstm_units']}, layers={result['params']['num_layers']}"
            print(f"{i+1:>4} | {val_score:>10.4f} | {test_score:>11.4f} | {threshold:>9.1f} | {time_taken:>7.1f} | {scaler:>6} | {params_str}")

        # Save results
        save_data = {
            'search_config': {
                'param_grid': PARAM_GRID,
                'max_trials': MAX_TRIALS,
                'sequence_length': SEQUENCE_LENGTH,
                'test_frac': TEST_FRAC,
                'val_frac': VAL_FRAC,
                'bounded_cols': BOUNDED_COLS,
                'improvements': [
                    'Mixed/MinMax scaling for bounded features',
                    'F-beta early stopping instead of loss',
                    'Threshold as hyperparameter',
                    'Improved memory management',
                    'Removed recurrent_dropout for GPU efficiency'
                ]
            },
            'best_params': best_params,
            'best_score': best_score,
            'all_results': results,
            'search_time': total_time,
            'timestamp': datetime.now().isoformat()
        }

        with open(RESULTS_PATH, 'w', encoding='utf-8') as f:
            json.dump(save_data, f, indent=2)

        print(f"\n💾 Results saved to: {RESULTS_PATH}")

    else:
        print("❌ No successful trials completed!")

    print("\n✅ IMPROVED Hyperparameter search completed!")

ModuleNotFoundError: No module named 'tensorflow'