## Importing Dependencies

In [29]:
import numpy as np
import pandas as pd
import gc
from IPython.display import clear_output
from concurrent.futures import ProcessPoolExecutor, as_completed
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import time
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error

## Loading Dataset

In [2]:
scaled_df = pd.read_csv('10_Year_Historical_Scaled.csv').drop('Unnamed: 0', axis=1)

In [3]:
scaled_df

Unnamed: 0,Ticker,Date,Adj Close,EMA 20,EMA 50,MACD Line,MACD Signal
0,AAL,2014-10-20,0.505697,0.529674,0.544502,0.607034,0.610477
1,AAL,2014-10-21,0.557537,0.535072,0.546847,0.639224,0.617646
2,AAL,2014-10-22,0.541571,0.538294,0.548377,0.654164,0.626708
3,AAL,2014-10-23,0.569956,0.544164,0.551131,0.682956,0.640369
4,AAL,2014-10-24,0.596369,0.552225,0.554972,0.721051,0.659782
...,...,...,...,...,...,...,...
3221755,ZYXI,2024-10-14,0.308423,0.348974,0.407447,0.406825,0.412171
3221756,ZYXI,2024-10-15,0.307252,0.349241,0.407083,0.411746,0.412160
3221757,ZYXI,2024-10-16,0.304910,0.349227,0.406615,0.414378,0.412735
3221758,ZYXI,2024-10-17,0.307252,0.349470,0.406285,0.418152,0.414030


## Dataset Creation and Train-Test Split

In [4]:
def create_dataset(data, lookback, prediction_horizon):
    n_samples = len(data) - lookback - prediction_horizon
    if n_samples <= 0:
        return np.empty((0, lookback, data.shape[1] - 1)), np.empty((0, prediction_horizon))

    X = np.empty((n_samples, lookback, data.shape[1] - 1), dtype=np.float32)
    Y = np.empty((n_samples, prediction_horizon), dtype=np.float32)

    for i in range(n_samples):
        X[i] = data[i:(i + lookback), 1:]
        Y[i] = data[(i + lookback):(i + lookback + prediction_horizon), 0]

    return X, Y

In [5]:
def process_stock_data(df, lookback, prediction_horizon):
    tickers = df['Ticker'].unique()
    train_x_list, train_y_list, test_x_list, test_y_list = [], [], [], []

    total_train_samples, total_test_samples = 0, 0

    for ticker in tickers:
        stock_data = df[df['Ticker'] == ticker].copy()
        stock_data.drop(columns=['Ticker', 'Date'], inplace=True)

        train_split_len = int(len(stock_data) * 0.8)
        train_data = stock_data.values[:train_split_len]
        test_data = stock_data.values[train_split_len:]

        usable_train_samples = len(train_data) - lookback - prediction_horizon
        usable_test_samples = len(test_data) - lookback - prediction_horizon

        print(f"Ticker: {ticker}, Train rows: {len(train_data)}, Test rows: {len(test_data)}")
        print(f"Ticker: {ticker}, Usable train samples: {usable_train_samples}, Usable test samples: {usable_test_samples}")

        train_x, train_y = create_dataset(train_data, lookback, prediction_horizon)
        test_x, test_y = create_dataset(test_data, lookback, prediction_horizon)

        total_train_samples += train_x.shape[0]
        total_test_samples += test_x.shape[0]

        train_x_list.append(train_x)
        train_y_list.append(train_y)
        test_x_list.append(test_x)
        test_y_list.append(test_y)

        del train_data, test_data
        gc.collect()

    train_x = np.concatenate(train_x_list, axis=0)
    train_y = np.concatenate(train_y_list, axis=0)
    test_x = np.concatenate(test_x_list, axis=0)
    test_y = np.concatenate(test_y_list, axis=0)

    clear_output(wait=True)
    print(f"Total train samples: {total_train_samples}, Total test samples: {total_test_samples}")

    return train_x, train_y, test_x, test_y

In [6]:
lookback = 60
prediction_horizon = 7

train_x, train_y, test_x, test_y = process_stock_data(scaled_df, lookback, prediction_horizon)

gc.collect()

print(f"Training data shape: X={train_x.shape}, Y={train_y.shape}")
print(f"Testing data shape: X={test_x.shape}, Y={test_y.shape}")

Total train samples: 2490880, Total test samples: 559360
Training data shape: X=(2490880, 60, 4), Y=(2490880, 7)
Testing data shape: X=(559360, 60, 4), Y=(559360, 7)


## Model Building and Training

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
print(f"Device being used: {device}")

Device being used: cpu


In [9]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=32):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, prediction_horizon)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [10]:
train_x_tensor = torch.FloatTensor(train_x).to(device)
train_y_tensor = torch.FloatTensor(train_y).to(device)
test_x_tensor = torch.FloatTensor(test_x).to(device)
test_y_tensor = torch.FloatTensor(test_y).to(device)

In [11]:
input_size = train_x.shape[2]
model = LSTMModel(input_size=input_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [12]:
num_epochs = 5
batch_size = 32

In [13]:
start_time = time.time()

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    num_batches = len(train_x_tensor) // batch_size

    with tqdm(total=num_batches, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for i in range(0, len(train_x_tensor), batch_size):
            optimizer.zero_grad()
            x_batch = train_x_tensor[i:i + batch_size]
            y_batch = train_y_tensor[i:i + batch_size]

            y_pred = model(x_batch)

            loss = criterion(y_pred, y_batch)
            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()

            pbar.update(1)
            pbar.set_postfix(loss=loss.item())

    avg_epoch_loss = epoch_loss / num_batches
    elapsed_time = time.time() - start_time
    time_left = (elapsed_time / (epoch + 1)) * (num_epochs - (epoch + 1))

    print(f'Epoch [{epoch + 1}/{num_epochs}], Avg Loss: {avg_epoch_loss:.4f}, Time Elapsed: {elapsed_time:.2f}s, Estimated Time Left: {time_left:.2f}s')

Epoch 1/5: 100%|██████████| 77840/77840 [04:34<00:00, 283.68batch/s, loss=0.000394]


Epoch [1/5], Avg Loss: 0.0023, Time Elapsed: 274.39s, Estimated Time Left: 1097.57s


Epoch 2/5: 100%|██████████| 77840/77840 [04:34<00:00, 283.95batch/s, loss=0.000418]


Epoch [2/5], Avg Loss: 0.0018, Time Elapsed: 548.53s, Estimated Time Left: 822.79s


Epoch 3/5: 100%|██████████| 77840/77840 [04:35<00:00, 282.79batch/s, loss=0.000423]


Epoch [3/5], Avg Loss: 0.0017, Time Elapsed: 823.79s, Estimated Time Left: 549.19s


Epoch 4/5: 100%|██████████| 77840/77840 [04:35<00:00, 282.73batch/s, loss=0.000417]


Epoch [4/5], Avg Loss: 0.0016, Time Elapsed: 1099.11s, Estimated Time Left: 274.78s


Epoch 5/5: 100%|██████████| 77840/77840 [04:34<00:00, 283.56batch/s, loss=0.000422]

Epoch [5/5], Avg Loss: 0.0016, Time Elapsed: 1373.62s, Estimated Time Left: 0.00s





## Saving the model

In [17]:
torch.save(model.state_dict(), 'LSTM.pth')

## Loading the model

In [12]:
model.load_state_dict(torch.load('LSTM.pth', map_location=device))

  model.load_state_dict(torch.load('LSTM.pth', map_location=device))


<All keys matched successfully>

## Testing

In [25]:
def get_batched_predictions(model, test_x_tensor, batch_size):
    predictions = []
    model.eval()
    with torch.no_grad():
        for i in range(0, len(test_x_tensor), batch_size):
            x_batch = test_x_tensor[i:i + batch_size]
            batch_predictions = model(x_batch)
            predictions.append(batch_predictions.cpu().numpy())
            torch.cuda.empty_cache()
    return np.concatenate(predictions, axis=0)

In [26]:
test_predictions = get_batched_predictions(model, test_x_tensor, batch_size)

test_y_true = test_y_tensor.cpu().numpy()

if len(test_predictions.shape) > 1 and test_predictions.shape[1] > 1:
    test_predictions = test_predictions[:, 0]
    test_y_true = test_y_true[:, 0]

In [30]:
mse = mean_squared_error(test_y_true, test_predictions)
mae = mean_absolute_error(test_y_true, test_predictions)
rmse = root_mean_squared_error(test_y_true, test_predictions)

In [32]:
print("Evaluation Metrics")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean-Squared Error (RMSE): {rmse:.2f}")

Evaluation Metrics
Mean Squared Error (MSE): 0.0013
Mean Absolute Error (MAE): 0.0263
Root Mean-Squared Error (RMSE): 0.04
