In [None]:
# Importing the required libraries
import numpy as np
import pandas as pd
import yfinance as yf

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop

import tensorflow as tf
import csv

from itertools import combinations

# Hide all CUDA devices so TensorFlow runs on the CPU only
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from sklearn.model_selection import ParameterGrid

# Fixed hyperparameters used for every training run in this script
fixed_params = dict(
    filters=64,           # filters per Conv1D layer
    kernel_size=3,        # Conv1D kernel width
    dropout_rate=0.2,     # rate passed to each Dropout layer
    learning_rate=0.001,  # optimizer learning rate
    batch_size=32,        # mini-batch size for model.fit
    epochs=50,            # upper bound on training epochs
    optimizer='adam',     # 'adam' or 'rmsprop'
)


def _event_dates(events):
    """Return the index of *events* as a set of tz-naive, midnight-normalized Timestamps.

    ``events`` is a pandas object indexed by datetime (or ``None``). A missing
    or empty table yields an empty set.
    """
    if events is None or len(events) == 0:
        return set()

    index = pd.DatetimeIndex(pd.to_datetime(events.index))
    if index.tz is not None:
        # Drop the timezone but keep the local wall-clock date so the values
        # are comparable with a tz-naive price index.
        index = index.tz_localize(None)
    return set(index.normalize())


def download_stock_data(ticker, start_date='2010-01-01', end_date='2024-12-31'):
    """Download price history for *ticker* with earnings and ex-dividend days removed.

    Args:
        ticker: Symbol passed to ``yf.download`` / ``yf.Ticker``.
        start_date: First date kept in the returned frame.
        end_date: ``end`` argument forwarded to ``yf.download``.

    Returns:
        The downloaded DataFrame, indexed by tz-naive normalized dates,
        without rows that fall on an earnings date or an ex-dividend date,
        trimmed to ``start_date`` onwards.
    """
    # Calculate extra buffer period (1 year before the requested start)
    buffer_start = pd.to_datetime(start_date) - pd.DateOffset(years=1)

    # Download data with buffer period
    data = yf.download(ticker, start=buffer_start.strftime('%Y-%m-%d'), end=end_date)

    # Fetch earnings report dates and ex-dividend dates. Either table may be
    # missing for a ticker, and the earnings index is timezone-aware, so both
    # are reduced to tz-naive calendar dates before being combined.
    ticker_data = yf.Ticker(ticker)
    earnings_dates = _event_dates(ticker_data.earnings_dates)
    ex_dividend_dates = _event_dates(ticker_data.dividends)
    dates_to_remove = earnings_dates.union(ex_dividend_dates)

    # Ensure data index is datetime, tz-naive and normalized; comparing a
    # tz-naive index against tz-aware timestamps never matches.
    price_index = pd.DatetimeIndex(pd.to_datetime(data.index))
    if price_index.tz is not None:
        price_index = price_index.tz_localize(None)
    data.index = price_index.normalize()

    # Remove both earnings & ex-dividend dates
    data = data[~data.index.isin(list(dates_to_remove))]

    # Trim the dataset to the actual requested start_date.
    # NOTE(review): this discards the buffer year before any indicator is
    # computed by later steps, so the buffer does not warm up rolling windows.
    data = data.loc[start_date:]

    print(dates_to_remove)

    return data

def add_technical_indicators(ticker, df):
    """Add technical-indicator columns to *df* in place and dump it to CSV.

    Columns added: 20MA, 50MA, 200MA, RSI, 12EMA, 26EMA, MACD, Signal_Line,
    Upper_BB, Lower_BB, CCI, ATR, ROC, Williams_%R and OBV. Remaining NaN
    values are forward-filled and then backward-filled.

    Args:
        ticker: Symbol, used only to name the CSV written under ``check/``.
        df: DataFrame with 'Close', 'High', 'Low' and 'Volume' columns.

    Returns:
        The same DataFrame object, which has been modified in place.
    """
    # Ensure calculations don't cause issues with chained assignments.
    # NOTE: this changes a process-wide pandas option.
    pd.options.mode.chained_assignment = None

    # Moving Averages
    df['20MA'] = df['Close'].rolling(window=20, min_periods=1).mean()
    df['50MA'] = df['Close'].rolling(window=50, min_periods=1).mean()
    df['200MA'] = df['Close'].rolling(window=200, min_periods=1).mean()

    # Relative Strength Index (RSI)
    delta = df['Close'].diff()
    gain = delta.where(delta > 0, 0).rolling(window=14, min_periods=1).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14, min_periods=1).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    # 0/0 (no gain and no loss) is undefined; treat it as neutral. The result
    # is assigned back rather than filled through `df['RSI']` with
    # inplace=True, which only updates a temporary under Copy-on-Write.
    df['RSI'] = rsi.fillna(50)

    # MACD and Signal Line
    df['12EMA'] = df['Close'].ewm(span=12, adjust=False, min_periods=1).mean()
    df['26EMA'] = df['Close'].ewm(span=26, adjust=False, min_periods=1).mean()
    df['MACD'] = df['12EMA'] - df['26EMA']
    df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False, min_periods=1).mean()

    # Bollinger Bands
    df['20STD'] = df['Close'].rolling(window=20, min_periods=1).std()
    df['Upper_BB'] = df['20MA'] + (df['20STD'] * 2)
    df['Lower_BB'] = df['20MA'] - (df['20STD'] * 2)

    # Commodity Channel Index (CCI)
    typical_price = (df['High'] + df['Low'] + df['Close']) / 3
    mean_dev = lambda x: np.mean(np.abs(x - np.mean(x)))
    tp_window = typical_price.rolling(window=20, min_periods=1)
    df['CCI'] = (typical_price - tp_window.mean()) / \
                (0.015 * tp_window.apply(mean_dev, raw=True))

    # Average True Range (ATR)
    prev_close = df['Close'].shift(1)
    df['TR'] = np.maximum(df['High'] - df['Low'],
                          np.maximum(abs(df['High'] - prev_close),
                                     abs(df['Low'] - prev_close)))
    df['ATR'] = df['TR'].rolling(window=14, min_periods=1).mean()
    df.drop(columns=['TR'], inplace=True)  # Drop intermediate column

    # Rate of Change (ROC)
    df['ROC'] = df['Close'].pct_change(periods=10) * 100

    # Williams %R
    highest_high = df['High'].rolling(window=14, min_periods=1).max()
    lowest_low = df['Low'].rolling(window=14, min_periods=1).min()
    df['Williams_%R'] = ((highest_high - df['Close']) /
                          (highest_high - lowest_low)) * -100

    # On-Balance Volume (OBV)
    df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0).cumsum()

    # Drop intermediate columns
    df.drop(columns=['20STD'], inplace=True)

    # Forward-fill and backward-fill NaN values
    df.ffill(inplace=True)
    df.bfill(inplace=True)

    # Verify no NaN values exist
    print("Null values in each column:\n", df.isnull().sum())
    print(f"Does the dataset contain any null values? {df.isnull().values.any()}")

    # Dump the enriched frame for manual inspection (the index is not written)
    folder_name = "check"
    os.makedirs(folder_name, exist_ok=True)
    filename = os.path.join(folder_name, f"{ticker}_CNN.csv")
    df.to_csv(filename, index=False)

    return df  # Original DataFrame is modified, so return is optional

def normalize_data(df, n_indicators=5):
    """Min-max scale 'Close' plus every *n_indicators*-sized subset of the indicators.

    Args:
        df: DataFrame containing 'Close' and all indicator columns listed below.
        n_indicators: Number of indicators per combination (default 5).

    Returns:
        A tuple ``(df, all_scaled_data)`` where ``df`` is the unchanged input
        and ``all_scaled_data`` maps each indicator tuple to the scaled array
        for ``['Close', *indicators]`` (one row per row of ``df``).

    NOTE(review): each scaler is fit on all rows of ``df``, including rows
    that later end up in the validation/test splits.
    """
    all_indicators = [
        '20MA', '50MA', '200MA', 'RSI', 'MACD', 'Signal_Line',
        'Upper_BB', 'Lower_BB', 'CCI', 'ATR', 'ROC', 'Williams_%R', 'OBV'
    ]

    all_scaled_data = {}  # indicator tuple -> scaled feature array

    # Iterate the combinations lazily; 'Close' is always the first feature
    for selected_indicators in combinations(all_indicators, n_indicators):
        selected_features = ['Close', *selected_indicators]

        # Normalize selected features with a fresh scaler per combination
        scaler = MinMaxScaler(feature_range=(0, 1))
        all_scaled_data[selected_indicators] = scaler.fit_transform(df[selected_features])

    return df, all_scaled_data  # Return the full dataset and all scaled variations

# Build sliding-window samples for the time-series model
def create_time_series_data(df, scaled_data, window_size=120):
    """Pair each window of scaled rows with the unscaled 'Close' that follows it.

    For every position ``i`` from ``window_size`` to ``len(df) - 1`` the sample
    is ``scaled_data[i - window_size:i]`` and the target is ``df['Close']`` at
    position ``i``. Returns ``(X, y)`` as NumPy arrays.
    """
    positions = range(window_size, len(df))
    close_column = df['Close']

    windows = [scaled_data[end - window_size:end] for end in positions]
    targets = [close_column.iloc[end] for end in positions]

    return np.array(windows), np.array(targets)

# Split data in order into training, validation and testing sets
# (defaults: 70% training, 10% validation, remainder for testing)
def split_data(X, y, train_size=0.7, val_size=0.1, test_size=0.2):
    """Cut X and y into consecutive train / validation / test segments.

    The first ``int(len(X) * train_size)`` items form the training set, the
    next ``int(len(X) * val_size)`` the validation set, and everything after
    that the test set. ``test_size`` is accepted but not used in the
    computation.

    Returns ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    n_samples = len(X)
    train_end = int(n_samples * train_size)
    val_end = train_end + int(n_samples * val_size)

    segments = (
        slice(None, train_end),
        slice(train_end, val_end),
        slice(val_end, None),
    )
    X_parts = [X[segment] for segment in segments]
    y_parts = [y[segment] for segment in segments]

    return (*X_parts, *y_parts)

# Build and compile the 1D-CNN regressor from a hyperparameter dict
def create_cnn_model(input_shape, params=fixed_params):
    """Return a compiled Sequential model with two Conv1D stages and one output unit.

    Args:
        input_shape: Shape passed to the Input layer.
        params: Dict providing 'filters', 'kernel_size', 'dropout_rate',
            'learning_rate' and 'optimizer' ('adam' or 'rmsprop').

    Raises:
        ValueError: If ``params['optimizer']`` is neither 'adam' nor 'rmsprop'.
    """
    n_filters = params['filters']
    kernel = params['kernel_size']
    drop = params['dropout_rate']

    model = Sequential([
        Input(shape=input_shape),
        # First convolution stage
        Conv1D(filters=n_filters, kernel_size=kernel, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(rate=drop),
        # Second convolution stage
        Conv1D(filters=n_filters, kernel_size=kernel, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(rate=drop),
        # Flatten and regress to a single value
        Flatten(),
        Dense(units=1),
    ])

    # Resolve the optimizer class, rejecting unsupported names
    optimizer_name = params['optimizer']
    if optimizer_name not in ('adam', 'rmsprop'):
        raise ValueError("Optimizer must be 'adam' or 'rmsprop'.")
    optimizer_cls = Adam if optimizer_name == 'adam' else RMSprop
    opt = optimizer_cls(learning_rate=params['learning_rate'])

    model.compile(optimizer=opt, loss='mean_squared_error')
    return model

# Define tickers
tickers = ['AAPL', 'BRK-B', 'XOM',  '1155.KL',  '0270.KL', '5681.KL']

# Dictionary to store results
results = {}

# Define the output folder
output_folder = "stock_results"
os.makedirs(output_folder, exist_ok=True)  # Ensure folder exists

# For every ticker: download the data, add indicators, then train and evaluate
# one CNN per indicator combination, writing one CSV row per combination.
for ticker in tickers:
    print(f"Running model for {ticker}...\n")

    df = download_stock_data(ticker)
    if df.empty:
        # Nothing to train on; keep going with the remaining tickers
        # instead of aborting the whole run.
        print(f"No data available for {ticker}; skipping.\n")
        continue

    df = add_technical_indicators(ticker, df)
    df, all_scaled_data = normalize_data(df)

    output_file = os.path.join(output_folder, f"{ticker}_results_CNN.csv")

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        header = ["Ticker", "Indicators", "Filters", "Kernel Size", "Dropout Rate", "Learning Rate", "Batch Size", "Epochs", "Optimizer", "MSE", "RMSE", "MAE", "MAPE"]
        writer = csv.writer(file)
        writer.writerow(header)  # Write header

        for selected_indicators, scaled_data in all_scaled_data.items():
            print(f"Training with indicators: {selected_indicators} and fixed parameters: {fixed_params}")
            X, y = create_time_series_data(df, scaled_data)

            X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

            # Scale target values (y); the scaler is fit on the training targets only
            scaler_y = MinMaxScaler(feature_range=(0, 1))
            y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
            y_val_scaled = scaler_y.transform(y_val.reshape(-1, 1))
            y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

            input_shape = (X_train.shape[1], X_train.shape[2])  # Input shape for CNN

            # A new model is built for every combination; release the Keras
            # global state left by the previous one so memory does not keep
            # growing across the loop.
            tf.keras.backend.clear_session()

            # Create and train the model with the fixed hyperparameters
            model = create_cnn_model(input_shape=input_shape)

            early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

            history = model.fit(
                X_train, y_train_scaled,
                epochs=fixed_params['epochs'],
                batch_size=fixed_params['batch_size'],
                validation_data=(X_val, y_val_scaled),
                callbacks=[early_stopping],
                verbose=0
            )

            # Evaluate the model in the original price scale (silent, like fit)
            y_pred_scaled = model.predict(X_test, verbose=0)
            y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))
            y_test_original = scaler_y.inverse_transform(y_test_scaled)

            mse = mean_squared_error(y_test_original, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test_original, y_pred)
            mape = mean_absolute_percentage_error(y_test_original, y_pred) * 100

            # Write the results to the CSV file
            writer.writerow([
                ticker,  # Ticker symbol
                ', '.join(selected_indicators),  # Selected indicators
                fixed_params['filters'],  # Filters
                fixed_params['kernel_size'],  # Kernel size
                fixed_params['dropout_rate'],  # Dropout rate
                fixed_params['learning_rate'],  # Learning rate
                fixed_params['batch_size'],  # Batch size
                fixed_params['epochs'],  # Number of epochs
                fixed_params['optimizer'],  # Optimizer
                mse,  # MSE
                rmse,  # RMSE
                mae,  # MAE
                mape  # MAPE
            ])

            # Flush the file buffer so finished rows survive an interrupted run
            file.flush()

            print(f"{ticker} - RMSE: {rmse:.4f}, MSE: {mse:.4f}, MAE: {mae:.4f}, MAPE: {mape:.4f}\n")

2025-03-17 21:47:57.492472: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-17 21:47:58.051219: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-17 21:48:00.402924: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Running model for AAPL...

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


{Timestamp('1991-11-18 00:00:00-0500', tz='America/New_York'), Timestamp('2023-05-03 00:00:00-0400', tz='America/New_York'), Timestamp('2024-11-08 00:00:00-0500', tz='America/New_York'), Timestamp('2014-02-06 00:00:00-0500', tz='America/New_York'), Timestamp('1995-05-26 00:00:00-0400', tz='America/New_York'), Timestamp('1990-11-16 00:00:00-0500', tz='America/New_York'), Timestamp('2019-11-07 00:00:00-0500', tz='America/New_York'), Timestamp('2022-11-04 00:00:00-0400', tz='America/New_York'), Timestamp('2022-05-06 00:00:00-0400', tz='America/New_York'), Timestamp('1987-08-10 00:00:00-0400', tz='America/New_York'), Timestamp('2019-05-10 00:00:00-0400', tz='America/New_York'), Timestamp('2015-11-05 00:00:00-0500', tz='America/New_York'), Timestamp('2021-02-05 00:00:00-0500', tz='America/New_York'), Timestamp('2024-07-31 00:00:00-0400', tz='America/New_York'), Timestamp('2025-10-28 00:00:00-0400', tz='America/New_York'), Timestamp('2014-05-08 00:00:00-0400', tz='America/New_York'), Timesta