In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import math
import os
import zipfile
from datetime import datetime, timedelta
# Set random seeds for reproducibility
np.random.seed(42)
import tensorflow as tf
tf.random.set_seed(42)


In [2]:

# 1. DATA LOADING AND PREPROCESSING
def load_stock_data(zip_file_path, ticker='AAPL', data_type='stocks'):
    """
    Load stock data from Kaggle Stock Market Dataset zip file
    
    Parameters:
    - zip_file_path: Path to the downloaded zip file
    - ticker: Stock symbol (default: Apple)
    - data_type: Either 'stocks' or 'etfs' (default: stocks)
    
    Returns:
    - DataFrame with stock data
    """
    print(f"Loading {ticker} {data_type} data from zip file...")
    
    # Check if the zip file exists
    if not os.path.exists(zip_file_path):
        print(f"Zip file not found at {zip_file_path}")
        return None
    
    # Construct the file path based on data type
    if data_type.lower() not in ['stocks', 'etfs']:
        print(f"Invalid data_type: {data_type}. Must be 'stocks' or 'etfs'")
        return None
    
    ticker_file = f"{data_type}/{ticker}.us.txt"
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as z:
            # Check if the ticker file exists in the zip
            if ticker_file not in z.namelist():
                print(f"No data for ticker {ticker} in the {data_type} folder")
                return None
            
            # Read the CSV data for the ticker
            with z.open(ticker_file) as f:
                df = pd.read_csv(f, parse_dates=['Date'])
    except Exception as e:
        print(f"Error loading data: {e}")
        return None
    
    # Set Date as index
    df.set_index('Date', inplace=True)
    
    print(f"Loaded {len(df)} days of trading data for {ticker}")
    return df

def get_ticker_metadata(zip_file_path, ticker=None):
    """
    Get metadata for tickers from symbols_valid_meta.csv
    
    Parameters:
    - zip_file_path: Path to the downloaded zip file
    - ticker: Optional ticker to filter for (default: None, returns all)
    
    Returns:
    - DataFrame with ticker metadata
    """
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as z:
            meta_file = "symbols_valid_meta.csv"
            if meta_file not in z.namelist():
                print(f"Metadata file not found in zip")
                return None
                
            with z.open(meta_file) as f:
                meta_df = pd.read_csv(f)
                
            # If a specific ticker is requested, filter for it
            if ticker:
                meta_df = meta_df[meta_df['Symbol'] == ticker]
                if meta_df.empty:
                    print(f"No metadata found for ticker {ticker}")
                    return None
                    
            return meta_df
    except Exception as e:
        print(f"Error loading metadata: {e}")
        return None

def preprocess_data(df):
    """
    Preprocess the stock data for modeling
    
    Parameters:
    - df: DataFrame with stock data
    
    Returns:
    - Processed DataFrame
    """
    # Make a copy to avoid modifying the original
    data = df.copy()
    
    # Check for expected columns based on the Kaggle dataset structure
    expected_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    for col in expected_columns:
        if col not in data.columns:
            print(f"Warning: Expected column '{col}' not found in data")
    
    # Focus on the adjusted close price as our target (preferred over close for long-term analysis)
    if 'Adj Close' in data.columns:
        # Keep relevant columns
        cols_to_keep = [col for col in expected_columns if col in data.columns]
        data = data[cols_to_keep].copy()
    elif 'Close' in data.columns:
        # If Adj Close is not available, use Close
        print("Note: Using 'Close' as target since 'Adj Close' is not available")
        cols_to_keep = [col for col in expected_columns if col in data.columns and col != 'Adj Close']
        data = data[cols_to_keep].copy()
    else:
        raise ValueError("Neither 'Close' nor 'Adj Close' column found in data")
    
    # Check for and handle missing values
    if data.isnull().sum().sum() > 0:
        print(f"Found {data.isnull().sum().sum()} missing values. Filling with forward fill method.")
        data = data.fillna(method='ffill')
        # If there are still NAs at the beginning, use backward fill
        data = data.fillna(method='bfill')
    
    # Add date-based features
    data['Date'] = data.index
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data['Month'] = data['Date'].dt.month
    data['Year'] = data['Date'].dt.year
    data['DayOfYear'] = data['Date'].dt.dayofyear
    
    # Define target column based on availability
    target_col = 'Adj Close' if 'Adj Close' in data.columns else 'Close'
    
    # Calculate technical indicators
    # Moving averages
    data['MA5'] = data[target_col].rolling(window=5).mean()
    data['MA20'] = data[target_col].rolling(window=20).mean()
    data['MA50'] = data[target_col].rolling(window=50).mean()
    
    # Exponential moving averages
    data['EMA12'] = data[target_col].ewm(span=12, adjust=False).mean()
    data['EMA26'] = data[target_col].ewm(span=26, adjust=False).mean()
    
    # MACD (Moving Average Convergence Divergence)
    data['MACD'] = data['EMA12'] - data['EMA26']
    data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()
    
    # Bollinger Bands
    data['BB_middle'] = data[target_col].rolling(window=20).mean()
    data['BB_std'] = data[target_col].rolling(window=20).std()
    data['BB_upper'] = data['BB_middle'] + (data['BB_std'] * 2)
    data['BB_lower'] = data['BB_middle'] - (data['BB_std'] * 2)
    data['BB_width'] = (data['BB_upper'] - data['BB_lower']) / data['BB_middle']
    
    # RSI (Relative Strength Index)
    delta = data[target_col].diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=14).mean()
    rs = gain / loss
    data['RSI'] = 100 - (100 / (1 + rs))
    
    # Calculate volatility (standard deviation over the past 20 days)
    data['Volatility'] = data[target_col].rolling(window=20).std()
    
    # Calculate percentage change
    data['Returns'] = data[target_col].pct_change()
    
    # Add price differences
    if all(col in data.columns for col in ['Open', 'High', 'Low', 'Close']):
        data['PriceRange'] = data['High'] - data['Low']
        data['DailyChange'] = data['Close'] - data['Open']
        data['CloseRatio'] = data['Close'] / data['Open']
        data['HL_PCT'] = (data['High'] - data['Low']) / data['Close'] * 100
        data['PCT_change'] = (data['Close'] - data['Open']) / data['Open'] * 100
    
    # Add volume indicators if volume data is available
    if 'Volume' in data.columns:
        data['VolumeSMA5'] = data['Volume'].rolling(window=5).mean()
        data['VolumeChange'] = data['Volume'].pct_change()
        # Price-volume trend
        data['PVT'] = (data[target_col].pct_change() * data['Volume']).cumsum()
    
    # Drop rows with NaN values (first rows due to rolling calculations)
    data = data.dropna()
    
    # Store the date index before any reset_index operations
    data['DateIndex'] = data.index
    
    return data, target_col


In [3]:

# 2. FEATURE ENGINEERING FOR TIME SERIES

def create_sequences(data, n_steps):
    """
    Create sequences for time series prediction
    
    Parameters:
    - data: Scaled data array
    - n_steps: Number of time steps to use as input features
    
    Returns:
    - X: Input sequences
    - y: Target values
    """
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data[i:(i + n_steps), 0])
        y.append(data[i + n_steps, 0])
    return np.array(X), np.array(y)

def prepare_lstm_data(data, target_col='Close', n_steps=60, test_size=0.2):
    """
    Prepare data for LSTM model
    
    Parameters:
    - data: DataFrame with stock data
    - target_col: Target column to predict
    - n_steps: Number of past time steps to use as features
    - test_size: Proportion of data to use for testing
    
    Returns:
    - Dictionary containing all prepared data and scalers
    """
    # Extract the target column
    dataset = data[target_col].values.reshape(-1, 1)
    
    # Scale the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset_scaled = scaler.fit_transform(dataset)
    
    # Create sequences
    X, y = create_sequences(dataset_scaled, n_steps)
    
    # Reshape X to the format [samples, time steps, features]
    X = X.reshape(X.shape[0], X.shape[1], 1)
    
    # Split into train and test sets
    split_idx = int(len(X) * (1 - test_size))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    
    # Get corresponding dates for plotting
    dates = data['DateIndex'].values[n_steps:]
    train_dates = dates[:split_idx]
    test_dates = dates[split_idx:]
    
    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'train_dates': train_dates,
        'test_dates': test_dates,
        'all_dates': dates,
        'n_steps': n_steps
    }

def prepare_traditional_ml_data(data, target_col='Close', test_size=0.2):
    """
    Prepare data for traditional ML models
    
    Parameters:
    - data: DataFrame with stock data
    - target_col: Target column to predict
    - test_size: Proportion of data to use for testing
    
    Returns:
    - Dictionary containing prepared data
    """
    # Define features and target
    feature_cols = [col for col in data.columns if col not in [target_col, 'Date', 'DateIndex']]
    X = data[feature_cols].values
    y = data[target_col].values
    
    # Scale features
    scaler_X = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    
    # Scale target
    scaler_y = MinMaxScaler()
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=test_size, shuffle=False
    )
    
    # Get corresponding dates for plotting
    dates = data['DateIndex'].values
    split_idx = len(X_train)
    train_dates = dates[:split_idx]
    test_dates = dates[split_idx:]
    
    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'train_dates': train_dates,
        'test_dates': test_dates,
        'feature_cols': feature_cols
    }


In [4]:

# 3. MODEL DEVELOPMENT

def build_lstm_model(input_shape):
    """
    Build an LSTM model for time series prediction
    
    Parameters:
    - input_shape: Shape of input data (time steps, features)
    
    Returns:
    - Compiled LSTM model
    """
    model = Sequential()
    
    # First LSTM layer with Dropout
    model.add(LSTM(units=50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    
    # Second LSTM layer with Dropout
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    
    # Dense output layer
    model.add(Dense(units=1))
    
    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    return model

def train_lstm_model(prepared_data, epochs=50, batch_size=32, verbose=1):
    """
    Train the LSTM model
    
    Parameters:
    - prepared_data: Dictionary with prepared data from prepare_lstm_data()
    - epochs: Number of training epochs
    - batch_size: Training batch size
    - verbose: Verbosity mode
    
    Returns:
    - Trained model and history
    """
    # Extract data
    X_train = prepared_data['X_train']
    y_train = prepared_data['y_train']
    X_test = prepared_data['X_test']
    y_test = prepared_data['y_test']
    
    # Define input shape
    input_shape = (X_train.shape[1], X_train.shape[2])
    
    # Build model
    model = build_lstm_model(input_shape)
    
    # Define early stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        callbacks=[early_stop],
        verbose=verbose
    )
    
    return model, history

def train_random_forest(prepared_data, n_estimators=100, verbose=1):
    """
    Train a Random Forest model
    
    Parameters:
    - prepared_data: Dictionary with prepared data from prepare_traditional_ml_data()
    - n_estimators: Number of trees in the forest
    - verbose: Verbosity mode
    
    Returns:
    - Trained model
    """
    # Extract data
    X_train = prepared_data['X_train']
    y_train = prepared_data['y_train']
    
    # Build and train model
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    if verbose:
        print("Training Random Forest model...")
    model.fit(X_train, y_train)
    if verbose:
        print("Training completed!")
    
    return model



In [5]:
# 4. MODEL EVALUATION

def evaluate_model(y_true, y_pred, model_name="Model"):
    """
    Evaluate model performance
    
    Parameters:
    - y_true: True values
    - y_pred: Predicted values
    - model_name: Name of the model for display
    
    Returns:
    - Dictionary with evaluation metrics
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print(f"\n{model_name} Performance Metrics:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R² Score: {r2:.4f}")
    
    return {
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2
    }

def predict_lstm(model, prepared_data):
    """
    Make predictions using the LSTM model
    
    Parameters:
    - model: Trained LSTM model
    - prepared_data: Dictionary with prepared data
    
    Returns:
    - Dictionary with predictions and true values
    """
    # Extract data
    X_train = prepared_data['X_train']
    y_train = prepared_data['y_train']
    X_test = prepared_data['X_test']
    y_test = prepared_data['y_test']
    scaler = prepared_data['scaler']
    
    # Make predictions
    train_predict = model.predict(X_train)
    test_predict = model.predict(X_test)
    
    # Inverse transform predictions
    train_predict = scaler.inverse_transform(train_predict)
    test_predict = scaler.inverse_transform(test_predict)
    
    # Inverse transform true values
    y_train_inv = scaler.inverse_transform(y_train.reshape(-1, 1))
    y_test_inv = scaler.inverse_transform(y_test.reshape(-1, 1))
    
    return {
        'train_predict': train_predict.flatten(),
        'test_predict': test_predict.flatten(),
        'y_train': y_train_inv.flatten(),
        'y_test': y_test_inv.flatten()
    }

def predict_rf(model, prepared_data):
    """
    Make predictions using the Random Forest model
    
    Parameters:
    - model: Trained Random Forest model
    - prepared_data: Dictionary with prepared data
    
    Returns:
    - Dictionary with predictions and true values
    """
    # Extract data
    X_train = prepared_data['X_train']
    y_train = prepared_data['y_train']
    X_test = prepared_data['X_test']
    y_test = prepared_data['y_test']
    scaler_y = prepared_data['scaler_y']
    
    # Make predictions
    train_predict_scaled = model.predict(X_train)
    test_predict_scaled = model.predict(X_test)
    
    # Reshape for inverse transform
    train_predict_scaled = train_predict_scaled.reshape(-1, 1)
    test_predict_scaled = test_predict_scaled.reshape(-1, 1)
    y_train_reshaped = y_train.reshape(-1, 1)
    y_test_reshaped = y_test.reshape(-1, 1)
    
    # Inverse transform predictions and true values
    train_predict = scaler_y.inverse_transform(train_predict_scaled).flatten()
    test_predict = scaler_y.inverse_transform(test_predict_scaled).flatten()
    y_train_inv = scaler_y.inverse_transform(y_train_reshaped).flatten()
    y_test_inv = scaler_y.inverse_transform(y_test_reshaped).flatten()
    
    return {
        'train_predict': train_predict,
        'test_predict': test_predict,
        'y_train': y_train_inv,
        'y_test': y_test_inv
    }



In [6]:
# 5. VISUALIZATION

def plot_training_history(history):
    """
    Plot LSTM training history
    
    Parameters:
    - history: History object from model.fit()
    """
    plt.figure(figsize=(12, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss During Training')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_predictions(predictions, dates, model_name="Model"):
    """
    Plot model predictions against actual values
    
    Parameters:
    - predictions: Dictionary with predictions from predict_lstm() or predict_rf()
    - dates: Dictionary with dates from prepared data
    - model_name: Name of the model for display
    """
    train_dates = dates['train_dates']
    test_dates = dates['test_dates']
    
    plt.figure(figsize=(16, 8))
    
    # Plot training data
    plt.plot(train_dates, predictions['y_train'], label='Actual (Training)')
    plt.plot(train_dates, predictions['train_predict'], label='Predicted (Training)')
    
    # Plot testing data
    plt.plot(test_dates, predictions['y_test'], label='Actual (Testing)', color='green')
    plt.plot(test_dates, predictions['test_predict'], label='Predicted (Testing)', color='red')
    
    # Add vertical line to separate training and testing periods
    split_date = test_dates[0]
    plt.axvline(x=split_date, color='black', linestyle='--')
    plt.text(split_date, plt.ylim()[1]*0.9, 'Train/Test Split', 
             horizontalalignment='center', backgroundcolor='white')
    
    plt.title(f'{model_name} - Stock Price Prediction')
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def plot_multiple_predictions(predictions_list, dates, model_names):
    """
    Plot predictions from multiple models for comparison
    
    Parameters:
    - predictions_list: List of prediction dictionaries
    - dates: Dictionary with dates from prepared data
    - model_names: List of model names
    """
    test_dates = dates['test_dates']
    
    plt.figure(figsize=(16, 8))
    
    # Plot actual test data
    plt.plot(test_dates, predictions_list[0]['y_test'], label='Actual', linewidth=2)
    
    # Plot predictions from each model
    for i, predictions in enumerate(predictions_list):
        plt.plot(test_dates, predictions['test_predict'], label=f'Predicted ({model_names[i]})', linestyle='--')
    
    plt.title('Model Comparison - Stock Price Prediction')
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def plot_forecast(last_sequence, model, scaler, n_steps, n_forecast=30, actual_data=None):
    """
    Generate and plot future stock price forecast
    
    Parameters:
    - last_sequence: Last sequence from the original data
    - model: Trained LSTM model
    - scaler: Scaler used on the original data
    - n_steps: Number of time steps used in sequences
    - n_forecast: Number of days to forecast
    - actual_data: Actual future data for comparison (if available)
    """
    # Make a copy of the last sequence
    curr_sequence = last_sequence.copy()
    
    # List to store forecasted values
    forecasted_values = []
    
    # Generate forecasts
    for _ in range(n_forecast):
        # Get prediction (scaled)
        pred = model.predict(curr_sequence.reshape(1, n_steps, 1))
        
        # Append prediction to the sequence and remove the first element
        curr_sequence = np.append(curr_sequence[1:], pred[0])
        
        # Store the forecasted value (unscaled)
        forecasted_values.append(scaler.inverse_transform([[pred[0, 0]]])[0, 0])
    
    # Generate dates for the forecast period
    last_date = datetime.now()
    forecast_dates = [last_date + timedelta(days=i) for i in range(1, n_forecast + 1)]
    
    # Plot the forecast
    plt.figure(figsize=(16, 8))
    
    # Plot actual data if available
    if actual_data is not None:
        plt.plot(actual_data['dates'], actual_data['values'], label='Actual', color='green')
    
    # Plot forecasted data
    plt.plot(forecast_dates, forecasted_values, label='Forecast', color='red', linestyle='--')
    
    plt.title('Stock Price Forecast')
    plt.xlabel('Date')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    return forecasted_values, forecast_dates


In [7]:

# 6. MAIN EXECUTION

def list_available_tickers(zip_file_path, data_type='both', limit=20):
    """
    List available ticker symbols in the dataset
    
    Parameters:
    - zip_file_path: Path to the downloaded zip file
    - data_type: Type of data to list - 'stocks', 'etfs', or 'both' (default)
    - limit: Maximum number of tickers to list per type
    
    Returns:
    - Dictionary with available tickers by type
    """
    if not os.path.exists(zip_file_path):
        print(f"Zip file not found at {zip_file_path}")
        return {}
    
    result = {}
    
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as z:
            # Process stocks if requested
            if data_type.lower() in ['stocks', 'both']:
                stock_tickers = [name.split('/')[-1].split('.us.txt')[0] for name in z.namelist() 
                               if name.startswith('stocks/') and name.endswith('.us.txt')]
                stock_tickers.sort()
                result['stocks'] = stock_tickers
                
                if limit and len(stock_tickers) > limit:
                    print(f"Found {len(stock_tickers)} stock tickers. Showing first {limit}:")
                    for ticker in stock_tickers[:limit]:
                        print(f"  - {ticker}")
                    print(f"  ... and {len(stock_tickers) - limit} more")
                else:
                    print(f"Found {len(stock_tickers)} stock tickers")
            
            # Process ETFs if requested
            if data_type.lower() in ['etfs', 'both']:
                etf_tickers = [name.split('/')[-1].split('.us.txt')[0] for name in z.namelist() 
                             if name.startswith('etfs/') and name.endswith('.us.txt')]
                etf_tickers.sort()
                result['etfs'] = etf_tickers
                
                if limit and len(etf_tickers) > limit:
                    print(f"Found {len(etf_tickers)} ETF tickers. Showing first {limit}:")
                    for ticker in etf_tickers[:limit]:
                        print(f"  - {ticker}")
                    print(f"  ... and {len(etf_tickers) - limit} more")
                else:
                    print(f"Found {len(etf_tickers)} ETF tickers")
            
            # Try to get metadata file
            meta_file = "symbols_valid_meta.csv"
            if meta_file in z.namelist():
                print(f"Metadata file found: symbols_valid_meta.csv")
                result['has_metadata'] = True
            else:
                print(f"Metadata file not found")
                result['has_metadata'] = False
                
            return result
    except Exception as e:
        print(f"Error listing tickers: {e}")
        return {}

def main(zip_file_path="../data/STOCK_PRICE_PREDICTION.zip", ticker="AAPL"):
    """
    Main execution function
    
    Parameters:
    - zip_file_path: Path to the downloaded zip file
    - ticker: Stock symbol to analyze
    """
    print(f"Starting stock price prediction for {ticker} using Kaggle Stock Market Dataset")
    
    # 1. Load stock data
    df = load_stock_data(zip_file_path, ticker=ticker)
    
    if df is None:
        print("Failed to load data. Listing available tickers...")
        available_tickers = list_available_tickers(zip_file_path)
        print(available_tickers)
        print("\nPlease choose one of the available tickers and try again.")
        return
    
    # Basic info about the data
    print("\nData Information:")
    print(df.info())
    print("\nFirst few rows:")
    print(df.head())
    
    # 2. Preprocess data
    print("\nPreprocessing data...")
    processed_data = preprocess_data(df)
    
    # 3. Prepare data for models
    print("\nPreparing data for LSTM model...")
    lstm_data = prepare_lstm_data(processed_data, n_steps=60)
    
    print("\nPreparing data for traditional ML models...")
    trad_data = prepare_traditional_ml_data(processed_data)
    
    # 4. Train LSTM model
    print("\nTraining LSTM model...")
    lstm_model, history = train_lstm_model(lstm_data, epochs=50)
    
    # 5. Train Random Forest model
    print("\nTraining Random Forest model...")
    rf_model = train_random_forest(trad_data)
    
    # 6. Make predictions
    print("\nMaking predictions with LSTM model...")
    lstm_predictions = predict_lstm(lstm_model, lstm_data)
    
    print("\nMaking predictions with Random Forest model...")
    rf_predictions = predict_rf(rf_model, trad_data)
    
    # 7. Evaluate models
    print("\nEvaluating LSTM model...")
    lstm_metrics = evaluate_model(lstm_predictions['y_test'], lstm_predictions['test_predict'], "LSTM")
    
    print("\nEvaluating Random Forest model...")
    rf_metrics = evaluate_model(rf_predictions['y_test'], rf_predictions['test_predict'], "Random Forest")
    
    # 8. Plot results
    print("\nPlotting training history...")
    plot_training_history(history)
    
    print("\nPlotting LSTM predictions...")
    lstm_dates = {'train_dates': lstm_data['train_dates'], 'test_dates': lstm_data['test_dates']}
    plot_predictions(lstm_predictions, lstm_dates, "LSTM")
    
    print("\nPlotting Random Forest predictions...")
    rf_dates = {'train_dates': trad_data['train_dates'], 'test_dates': trad_data['test_dates']}
    plot_predictions(rf_predictions, rf_dates, "Random Forest")
    
    print("\nPlotting model comparison...")
    model_names = ["LSTM", "Random Forest"]
    predictions_list = [lstm_predictions, rf_predictions]
    dates = {'test_dates': lstm_data['test_dates']}  # Assuming same test dates for both
    plot_multiple_predictions(predictions_list, dates, model_names)
    
    # 9. Generate future forecast (next 30 days)
    print("\nGenerating future forecast with LSTM model...")
    # Last sequence from the test data
    last_sequence = lstm_data['X_test'][-1]
    last_date = lstm_data['test_dates'][-1]
    forecasted_values, forecast_dates = plot_forecast(last_sequence, lstm_model, lstm_data['scaler'], lstm_data['n_steps'], n_forecast=30)
    
    print(f"\nForecasted stock prices for {ticker} for the next 30 days after {last_date.strftime('%Y-%m-%d')}:")
    for i, (date, price) in enumerate(zip(forecast_dates, forecasted_values), 1):
        print(f"Day {i}: {date.strftime('%Y-%m-%d')} - ${price:.2f}")
    
    # 10. Feature importance from Random Forest (for insights)
    if hasattr(rf_model, 'feature_importances_'):
        print("\nFeature Importances from Random Forest:")
        feature_cols = trad_data['feature_cols']
        importances = rf_model.feature_importances_
        feature_importance = pd.DataFrame({'Feature': feature_cols, 'Importance': importances})
        feature_importance = feature_importance.sort_values('Importance', ascending=False)
        print(feature_importance.head(10))
        
        # Plot feature importances
        plt.figure(figsize=(12, 6))
        plt.barh(feature_importance['Feature'].head(10), feature_importance['Importance'].head(10))
        plt.xlabel('Importance')
        plt.title(f'Top 10 Feature Importances for {ticker} Stock Price Prediction')
        plt.gca().invert_yaxis()  # Invert to have the most important at the top
        plt.tight_layout()
        plt.show()
    
    print("\nAnalysis complete!")

if __name__ == "__main__":
    # You need to download the stock market dataset from Kaggle:
    # https://www.kaggle.com/datasets/jacksoncrow/stock-market-dataset
    # And provide the path to the downloaded zip file
    
    # Example usage:
    # 1. Just specify the path to the downloaded zip file (will use AAPL by default)
    # main("path/to/stock_market_data.zip")
    
    # 2. Specify both the path and ticker
    # main("path/to/stock_market_data.zip", "MSFT")
    
    # To list available tickers in the dataset:
    # list_available_tickers("path/to/stock_market_data.zip")
    
    # Replace with your actual file path
    # For example:
    # main("./stock_market_data.zip", "AAPL")
    
    # print("Please update the script with the path to your downloaded dataset zip file.")
    # print("Example usage: main('./stock_market_data.zip', 'AAPL')")
    
    # Uncomment and update the line below with your local file path
    main("../data/STOCK_PRICE_PREDICTION.zip", "AAPL")

Starting stock price prediction for AAPL using Kaggle Stock Market Dataset
Loading AAPL stocks data from zip file...
No data for ticker AAPL in the stocks folder
Failed to load data. Listing available tickers...
Found 0 stock tickers
Found 0 ETF tickers
Metadata file found: symbols_valid_meta.csv
{'stocks': [], 'etfs': [], 'has_metadata': True}

Please choose one of the available tickers and try again.
