In [1]:
#Importing the required libraries
import numpy as np
import pandas as pd
import yfinance as yf

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

import tensorflow as tf
import csv

from itertools import combinations

#Error Supression

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')


def download_stock_data(ticker, start_date='2010-01-01', end_date='2024-12-31'):
    # Calculate extra buffer period (e.g., 1 year)
    buffer_start = pd.to_datetime(start_date) - pd.DateOffset(years=1)
    
    # Download data with buffer period
    data = yf.download(ticker, start=buffer_start.strftime('%Y-%m-%d'), end=end_date)

    # Fetch earnings report dates
    ticker_data = yf.Ticker(ticker)
    earnings_dates = set(pd.to_datetime(ticker_data.earnings_dates.index).normalize())  # Normalize dates

    # Fetch ex-dividend dates
    ex_dividend_dates = set(pd.to_datetime(ticker_data.dividends.index).normalize())

    # Combine both sets
    dates_to_remove = earnings_dates.union(ex_dividend_dates)

    # Ensure data index is datetime and normalized
    data.index = pd.to_datetime(data.index).normalize()

    # Remove both earnings & ex-dividend dates
    data = data[~data.index.isin(dates_to_remove)]

    # Trim the dataset to the actual requested start_date
    data = data.loc[start_date:]

    print(dates_to_remove)

    return data


def add_technical_indicators(ticker, df):
    # Ensure calculations don't cause issues with chained assignments
    pd.options.mode.chained_assignment = None  

    # Moving Averages
    df['20MA'] = df['Close'].rolling(window=20, min_periods=1).mean()
    df['50MA'] = df['Close'].rolling(window=50, min_periods=1).mean()
    df['200MA'] = df['Close'].rolling(window=200, min_periods=1).mean()

    # Relative Strength Index (RSI)
    delta = df['Close'].diff()
    gain = delta.where(delta > 0, 0).rolling(window=14, min_periods=1).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14, min_periods=1).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    df['RSI'].fillna(50, inplace=True)

    # MACD and Signal Line
    df['12EMA'] = df['Close'].ewm(span=12, adjust=False, min_periods=1).mean()
    df['26EMA'] = df['Close'].ewm(span=26, adjust=False, min_periods=1).mean()
    df['MACD'] = df['12EMA'] - df['26EMA']
    df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False, min_periods=1).mean()

    # Bollinger Bands
    df['20STD'] = df['Close'].rolling(window=20, min_periods=1).std()
    df['Upper_BB'] = df['20MA'] + (df['20STD'] * 2)
    df['Lower_BB'] = df['20MA'] - (df['20STD'] * 2)

    # Commodity Channel Index (CCI)
    typical_price = (df['High'] + df['Low'] + df['Close']) / 3
    mean_dev = lambda x: np.mean(np.abs(x - np.mean(x)))
    df['CCI'] = (typical_price - typical_price.rolling(window=20, min_periods=1).mean()) / \
                (0.015 * typical_price.rolling(window=20, min_periods=1).apply(mean_dev, raw=True))

    # Average True Range (ATR)
    df['TR'] = np.maximum(df['High'] - df['Low'], 
                          np.maximum(abs(df['High'] - df['Close'].shift(1)), 
                                     abs(df['Low'] - df['Close'].shift(1))))
    df['ATR'] = df['TR'].rolling(window=14, min_periods=1).mean()
    df.drop(columns=['TR'], inplace=True)  # Drop intermediate column

    # Rate of Change (ROC)
    df['ROC'] = df['Close'].pct_change(periods=10) * 100

    # Williams %R
    df['Williams_%R'] = ((df['High'].rolling(window=14, min_periods=1).max() - df['Close']) / 
                          (df['High'].rolling(window=14, min_periods=1).max() - df['Low'].rolling(window=14, min_periods=1).min())) * -100

    # On-Balance Volume (OBV)
    df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0).cumsum()

    # Drop intermediate columns
    df.drop(columns=['20STD'], inplace=True)

    # Forward-fill and backward-fill NaN values
    df.ffill(inplace=True)
    df.bfill(inplace=True)

    # Verify no NaN values exist
    print("Null values in each column:\n", df.isnull().sum())
    print(f"Does the dataset contain any null values? {df.isnull().values.any()}")

    folder_name = "check"
    os.makedirs(folder_name, exist_ok=True)
    filename = os.path.join(folder_name, f"{ticker}.csv")
    df.to_csv(filename, index=False)

    return df  # Original DataFrame is modified, so return is optional


def normalize_data(df):
    all_indicators = [
        '20MA', '50MA', '200MA', 'RSI', 'MACD', 'Signal_Line', 
        'Upper_BB', 'Lower_BB', 'CCI', 'ATR', 'ROC', 'Williams_%R', 'OBV'
    ]

    # Generate all possible combinations of 5 indicators + 'Close'
    indicator_combinations = list(combinations(all_indicators, 5))

    # Dictionary to store performance results
    results = {}

    all_scaled_data = {}  # Dictionary to store scaled data for each combination

    for selected_indicators in indicator_combinations:
        selected_features = ['Close'] + list(selected_indicators)  # Always include Close

        # Normalize selected features
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_data = scaler.fit_transform(df[selected_features])

        # Store the scaled data
        all_scaled_data[selected_indicators] = scaled_data

    return df, all_scaled_data  # Return the full dataset and all scaled variations

# Prepare the time-series data
def create_time_series_data(df, scaled_data, window_size=120):
    X, y = [], []
    for i in range(window_size, len(df)):
        X.append(scaled_data[i-window_size:i])
        y.append(df['Close'].iloc[i])
    return np.array(X), np.array(y)

# Split data into training and testing sets (80% training,20% testing)
def split_data(X, y, train_size=0.7, val_size=0.1, test_size=0.2):
    train_end = int(len(X) * train_size)
    val_end = train_end + int(len(X) * val_size)

    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]

    return X_train, X_val, X_test, y_train, y_val, y_test

# Create LSTM model
def create_lstm_model(input_shape):
    model = Sequential()
    
    # Input layer
    model.add(Input(shape=input_shape))
    
    # LSTM Layer
    model.add(LSTM(units=50, return_sequences=True))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    
    # Fully connected output layer
    model.add(Dense(units=1))  # Predicting one value (the next day's closing price)
    
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')
    return model



# Define tickers
tickers = ['AAPL', 'NVDA', 'MSFT', 'BRK-B', 'JPM', 'V', 'XOM', 'CVX', 'COP']

# Dictionary to store results
results = {}

# Define the output folder
output_folder = "stock_results"
os.makedirs(output_folder, exist_ok=True)  # Ensure folder exists

for ticker in tickers:
    print(f"Running model for {ticker}...\n")

    df = download_stock_data(ticker)
    df = add_technical_indicators(ticker, df)
    df, all_scaled_data = normalize_data(df)

    print(f"all_scaled_data: {all_scaled_data}")  # Debugging print

    output_file = os.path.join(output_folder, f"{ticker}_results.csv")

    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Indicators", "MSE", "RMSE", "MAE", "MAPE"])  # Write header

        for selected_indicators, scaled_data in all_scaled_data.items():
            print(f"Training with indicators: {selected_indicators}")  # Debugging print

            X, y = create_time_series_data(df, scaled_data)
            print(f"X shape: {X.shape}, y shape: {y.shape}")  # Debugging print

            X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")  # Debugging print

            # Scale target values (y)
            scaler_y = MinMaxScaler(feature_range=(0, 1))
            y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
            y_val_scaled = scaler_y.transform(y_val.reshape(-1, 1))
            y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

            input_shape = (X_train.shape[1], X_train.shape[2])

            model = create_lstm_model(input_shape)
            early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

            history = model.fit(
                X_train, y_train_scaled,
                epochs=20, batch_size=32,
                validation_data=(X_val, y_val_scaled),
                callbacks=[early_stopping],
                verbose=1
            )

            print(f"Finished training for {selected_indicators}\n")  # Confirmation print

            y_pred_scaled = model.predict(X_test)
            y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))
            y_test_original = scaler_y.inverse_transform(y_test_scaled)

            mse = mean_squared_error(y_test_original, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test_original, y_pred)
            mape = mean_absolute_percentage_error(y_test_original, y_pred) * 100

            results[ticker] = {
                "RMSE": rmse,
                "MSE": mse,
                "MAE": mae,
                "MAPE": mape
            }

            writer.writerow([', '.join(selected_indicators), mse, rmse, mae, mape])

            print(f"{ticker} - RMSE: {rmse:.4f}, MSE: {mse:.4f}, MAE: {mae:.4f}, MAPE: {mape:.4f}\n")

            # # Plot actual vs predicted prices
            # plt.figure(figsize=(12, 6))
            # plt.plot(y_test_original, label="Actual Prices", color="blue", linewidth=2)
            # plt.plot(y_pred, label="Predicted Prices", color="red", linestyle="dashed", linewidth=2)
            # plt.title(f"{ticker} Stock Price Prediction (LSTM)")
            # plt.xlabel("Time")
            # plt.ylabel("Stock Price (USD)")
            # plt.legend()
            # plt.show()


2025-03-02 10:25:41.668208: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-02 10:25:41.685691: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-02 10:25:41.843246: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-02 10:25:41.981337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740882342.105521    2168 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740882342.14

Running model for AAPL...

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


{Timestamp('1991-11-18 00:00:00-0500', tz='America/New_York'), Timestamp('2023-05-03 00:00:00-0400', tz='America/New_York'), Timestamp('2024-11-08 00:00:00-0500', tz='America/New_York'), Timestamp('2014-02-06 00:00:00-0500', tz='America/New_York'), Timestamp('1995-05-26 00:00:00-0400', tz='America/New_York'), Timestamp('1990-11-16 00:00:00-0500', tz='America/New_York'), Timestamp('2019-11-07 00:00:00-0500', tz='America/New_York'), Timestamp('2022-11-04 00:00:00-0400', tz='America/New_York'), Timestamp('2022-05-06 00:00:00-0400', tz='America/New_York'), Timestamp('1987-08-10 00:00:00-0400', tz='America/New_York'), Timestamp('2019-05-10 00:00:00-0400', tz='America/New_York'), Timestamp('2015-11-05 00:00:00-0500', tz='America/New_York'), Timestamp('2021-02-05 00:00:00-0500', tz='America/New_York'), Timestamp('2024-07-31 00:00:00-0400', tz='America/New_York'), Timestamp('2025-10-28 00:00:00-0400', tz='America/New_York'), Timestamp('2014-05-08 00:00:00-0400', tz='America/New_York'), Timesta

2025-03-02 10:25:50.813434: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 83ms/step - loss: 0.0484 - val_loss: 0.5164
Epoch 2/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 79ms/step - loss: 0.0114 - val_loss: 0.0112
Epoch 3/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 79ms/step - loss: 0.0016 - val_loss: 0.0081
Epoch 4/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 78ms/step - loss: 0.0017 - val_loss: 0.0084
Epoch 5/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 79ms/step - loss: 0.0014 - val_loss: 0.0116
Epoch 6/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 80ms/step - loss: 0.0013 - val_loss: 0.0085
Epoch 7/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 80ms/step - loss: 0.0011 - val_loss: 0.0121
Epoch 8/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 83ms/step - loss: 0.0010 - val_loss: 0.0110
Finished training for ('20MA', '50MA', '200MA', 'RSI', 'MAC

KeyboardInterrupt: 