## Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
from tqdm.notebook import tqdm

In [2]:
# Suppress every DeprecationWarning for the rest of the session, whatever its source.
# NOTE(review): this is a blanket filter — presumably aimed at one specific library
# warning; confirm which one and narrow it with `message=` / `module=` if possible.
warnings.filterwarnings('ignore', category=DeprecationWarning)

## Technical Indicators Calculation

In [3]:
# Load the preprocessed price history. Later cells read the 'Ticker', 'Date' and
# 'Adj Close' columns from this frame, so the CSV must provide them.
df = pd.read_csv('10_Year_Historical_Preprocessed.csv')

In [4]:
def exponential_moving_avg(df: pd.DataFrame, price_col='Adj Close', window_size=20):
    """Return the exponential moving average of one column of ``df``.

    Args:
        df: Frame holding the price series.
        price_col: Name of the column to smooth.
        window_size: Span of the exponential window.

    Returns:
        A Series, indexed like ``df``, with the exponentially weighted mean
        computed with ``adjust=False``.
    """
    prices = df[price_col]
    weighted = prices.ewm(span=window_size, adjust=False)
    return weighted.mean()

def macd_line(df: pd.DataFrame, price_col='Adj Close', short_window=12, long_window=26):
    """Return the MACD line: short-span EMA minus long-span EMA of ``price_col``.

    Args:
        df: Frame holding the price series.
        price_col: Name of the column the EMAs are computed on.
        short_window: Span of the fast EMA.
        long_window: Span of the slow EMA.

    Returns:
        A Series, indexed like ``df``, with the difference of the two EMAs.
    """
    fast_ema, slow_ema = (
        exponential_moving_avg(df, price_col, span)
        for span in (short_window, long_window)
    )
    return fast_ema - slow_ema

def macd_signal(df: pd.DataFrame, price_col='Adj Close', signal_window=9, short_window=12, long_window=26):
    """Return the MACD signal line: an EMA of the MACD line.

    Args:
        df: Frame holding the price series.
        price_col: Name of the column the MACD line is computed on.
        signal_window: Span of the EMA applied to the MACD line.
        short_window: Span of the fast EMA inside the MACD line.
        long_window: Span of the slow EMA inside the MACD line.

    Returns:
        A Series, indexed like ``df``, with the smoothed MACD line
        (``adjust=False``).
    """
    line = macd_line(df, price_col, short_window, long_window)
    smoothed = line.ewm(span=signal_window, adjust=False)
    return smoothed.mean()


In [5]:
def _per_ticker_indicator(indicator, **params):
    """Evaluate ``indicator`` on the 'Adj Close' prices of each ticker separately.

    ``groupby(...).transform`` returns a Series aligned to ``df``'s own index, so
    the result can be assigned straight back as a column. This replaces
    ``groupby.apply(...).reset_index(level=0, drop=True)``, which relies on the
    grouping column being passed to the callback and returns a wide frame
    instead of a Series when all groups share an index (e.g. a single ticker).
    """
    return df.groupby('Ticker')['Adj Close'].transform(
        # The Series handed to the callback does not reliably carry the column
        # name, while the helpers expect a frame with an 'Adj Close' column,
        # so build a one-column frame with an explicit name.
        lambda prices: indicator(prices.to_frame(name='Adj Close'), price_col='Adj Close', **params)
    )


# Each indicator is computed within a ticker, so one stock's smoothing never
# carries over into the next stock's rows.
df['EMA 20'] = _per_ticker_indicator(exponential_moving_avg, window_size=20)
df['EMA 50'] = _per_ticker_indicator(exponential_moving_avg, window_size=50)
df['MACD Line'] = _per_ticker_indicator(macd_line)
df['MACD Signal'] = _per_ticker_indicator(macd_signal)

## Feature Scaling

In [6]:
# Cast every numeric column to float before scaling.
# NOTE(review): presumably so integer columns can hold the fractional values
# the scaler writes back — confirm.
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].astype(float)

In [7]:
def scale_group(group):
    """Min-max scale the numeric columns of one ticker's rows into [0, 1].

    A fresh ``MinMaxScaler`` is fit on the rows of ``group`` only, so every
    ticker is scaled against its own minimum and maximum.

    Args:
        group: DataFrame holding the rows of a single ticker.

    Returns:
        A copy of ``group`` whose numeric columns are replaced by their scaled
        values; non-numeric columns are left untouched. ``group`` itself is not
        modified.
    """
    # Work on a copy: the frame passed in by groupby must not be mutated.
    scaled = group.copy()

    # Pick the features by dtype rather than by position (`iloc[:, 2:]`), so the
    # result does not depend on how many identifier columns precede the
    # features or on whether the grouping column is present in the group.
    feature_cols = scaled.select_dtypes(include=[np.number]).columns
    if len(scaled) == 0 or len(feature_cols) == 0:
        # Nothing to fit on; the scaler would raise on an empty matrix.
        return scaled

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled[feature_cols] = scaler.fit_transform(scaled[feature_cols])
    return scaled

In [8]:
# Scale each ticker separately and stitch the pieces back together in ticker order.
# Iterating the groups directly (instead of `groupby.apply`) always hands
# `scale_group` the full rows including 'Ticker', and avoids having to undo the
# extra (Ticker, level_1) index that `apply` adds to its result.
# NOTE(review): the scaler is fit on each ticker's full history, before the
# train/test split further down, so test-period extremes influence the scaled
# training values — consider fitting on the training span only.
scaled_df = pd.concat([scale_group(group) for _, group in df.groupby('Ticker')])
scaled_df = scaled_df.set_index('Date')

In [9]:
# Show the scaled frame (indexed by Date, one row per ticker and day).
scaled_df

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Adj Close,Volume,EMA 20,EMA 50,MACD Line,MACD Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-10-20,AAL,0.508197,0.525155,0.519677,0.529638,0.505697,0.028508,0.529674,0.544502,0.607034,0.610477
2014-10-21,AAL,0.547181,0.575065,0.560444,0.582844,0.557537,0.058501,0.535072,0.546847,0.639224,0.617646
2014-10-22,AAL,0.585366,0.581880,0.579011,0.566458,0.541571,0.041206,0.538294,0.548377,0.654164,0.626708
2014-10-23,AAL,0.572971,0.621167,0.576186,0.595590,0.569956,0.074754,0.544164,0.551131,0.682956,0.640369
2014-10-24,AAL,0.596361,0.616556,0.612109,0.622699,0.596369,0.038575,0.552225,0.554972,0.721051,0.659782
...,...,...,...,...,...,...,...,...,...,...,...
2024-10-14,ZYXI,0.293774,0.293993,0.311802,0.304949,0.308423,0.009538,0.348974,0.407447,0.406825,0.412171
2024-10-15,ZYXI,0.302239,0.296963,0.323342,0.303791,0.307252,0.012973,0.349241,0.407083,0.411746,0.412160
2024-10-16,ZYXI,0.307625,0.295849,0.321694,0.301474,0.304910,0.008845,0.349227,0.406615,0.414378,0.412735
2024-10-17,ZYXI,0.304162,0.292136,0.320457,0.303791,0.307252,0.009155,0.349470,0.406285,0.418152,0.414030


## Test Code

In [None]:
# Window sizes passed to create_lstm_dataset below: each sample uses `lookback`
# consecutive rows as input and the following `prediction_horizon` rows as target.
lookback = 60
prediction_horizon = 7

def split_data_by_time(df, split_ratio=0.8):
    """Split every ticker's rows into an earlier (train) and a later (test) part.

    Each ticker group is sorted by index, and the first
    ``int(len(group) * split_ratio)`` rows go to the training frame while the
    rest go to the test frame.

    Args:
        df: Frame with a 'Ticker' column whose index orders the rows in time.
        split_ratio: Fraction of each ticker's rows assigned to training;
            must lie in [0, 1].

    Returns:
        ``(train_df, test_df)``, each the concatenation of the per-ticker
        parts in ticker order. Both are empty frames with ``df``'s columns when
        ``df`` contains no ticker groups.

    Raises:
        ValueError: If ``split_ratio`` is outside [0, 1].
    """
    # A ratio outside [0, 1] would silently yield nonsense slices
    # (e.g. a negative cut-off slicing from the end).
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    train_parts, test_parts = [], []
    for _, group in df.groupby('Ticker'):
        ordered = group.sort_index()
        cut = int(len(ordered) * split_ratio)
        train_parts.append(ordered.iloc[:cut])
        test_parts.append(ordered.iloc[cut:])

    if not train_parts:
        # pd.concat raises on an empty list, so return empty frames instead.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    return pd.concat(train_parts), pd.concat(test_parts)

# Step 2: Create LSTM dataset for training and testing data
def create_lstm_dataset(df, lookback, prediction_horizon, target_col='Close'):
    """Build sliding-window samples per ticker for sequence-model training.

    For every ticker, the rows are sorted by index and cut into overlapping
    samples: ``lookback`` consecutive rows of all numeric columns form the
    input, and the ``target_col`` values of the next ``prediction_horizon``
    rows form the target. Windows never cross ticker boundaries.

    Args:
        df: Frame with a 'Ticker' column and numeric feature columns.
        lookback: Number of rows in each input window (>= 1).
        prediction_horizon: Number of future rows in each target (>= 1).
        target_col: Name of the numeric column to predict. The target used to
            be taken positionally as numeric column 0, which is 'Open' in the
            scaled frame even though the intent was the Close price.

    Returns:
        ``(x, y)`` where ``x`` has shape ``(samples, lookback, n_features)``
        and ``y`` has shape ``(samples, prediction_horizon)``. Tickers with
        fewer than ``lookback + prediction_horizon`` rows contribute no
        samples; if no ticker has enough rows, both arrays have zero samples.

    Raises:
        ValueError: If ``lookback`` or ``prediction_horizon`` is below 1.
        KeyError: If ``target_col`` is not a numeric column of ``df``.
    """
    if lookback < 1 or prediction_horizon < 1:
        raise ValueError("lookback and prediction_horizon must both be >= 1")

    feature_cols = df.select_dtypes(include=[np.number]).columns
    if target_col not in feature_cols:
        raise KeyError(f"target column {target_col!r} is not a numeric column of df")
    target_pos = feature_cols.get_loc(target_col)

    min_rows = lookback + prediction_horizon
    x_parts, y_parts = [], []

    for _, group in df.groupby('Ticker'):
        values = group.sort_index()[feature_cols].to_numpy()

        # Too short for even one sample. Appending an empty 1-D array here
        # would make the final np.concatenate fail on mismatched dimensions.
        if len(values) < min_rows:
            continue
        n_samples = len(values) - min_rows + 1

        # sliding_window_view appends the window axis last:
        # (windows, n_features, lookback) -> (windows, lookback, n_features).
        # Only the first n_samples windows have a full target after them.
        windows = np.lib.stride_tricks.sliding_window_view(values, lookback, axis=0)
        x_parts.append(windows[:n_samples].transpose(0, 2, 1))

        # Targets start right after the first input window.
        future = values[lookback:, target_pos]
        y_parts.append(np.lib.stride_tricks.sliding_window_view(future, prediction_horizon))

    if not x_parts:
        return (
            np.empty((0, lookback, len(feature_cols))),
            np.empty((0, prediction_horizon)),
        )

    # np.concatenate copies, so the returned arrays are ordinary writable arrays.
    return np.concatenate(x_parts), np.concatenate(y_parts)

# Split each ticker's rows into an earlier 80% (train) and a later 20% (test).
train_df, test_df = split_data_by_time(scaled_df, split_ratio=0.8)

# Build the windowed inputs and targets from each split separately, so that no
# window spans the train/test boundary.
train_x, train_y = create_lstm_dataset(train_df, lookback, prediction_horizon)
test_x, test_y = create_lstm_dataset(test_df, lookback, prediction_horizon)

# Report the resulting array shapes.
print("Training data shape (X, y):", train_x.shape, train_y.shape)
print("Testing data shape (X, y):", test_x.shape, test_y.shape)
