<div align="center">

### Lab 4

# National Tsing Hua University

#### Spring 2025

#### 11320IEEM 513600

#### Deep Learning and Industrial Applications
    
## Lab 4: Predicting Stock Price with Deep Learning

</div>

### Introduction

In this lab, we apply Long Short-Term Memory (LSTM) networks, a type of recurrent neural network, to time-series data in order to predict stock prices. Specifically, we will use historical price data from Nvidia to forecast the stock's price for the next day based on the prices of the previous N days. This approach is particularly relevant given the volatile nature of stock markets and the increasing reliance on automated trading systems.

### Objectives

- To understand the fundamentals of LSTM networks and their application in time-series forecasting.
- To develop a predictive model that can accurately forecast Nvidia's stock price for the next day using historical data.

### Dataset

The dataset for this lab is from the "Huge Stock Market Dataset" available on Kaggle. This dataset includes daily prices and volumes for all US stocks and ETFs; in this lab we use only the Nvidia (NVDA) data. The dataset features include:

- **Date**: The trading date of each record.
- **Open**: The price at which the stock first traded upon the opening of an exchange on a given trading day.
- **High**: The highest price at which the stock traded during the trading day.
- **Low**: The lowest price at which the stock traded during the trading day.
- **Close**: The price of the stock at closing time.
- **Volume**: The number of shares or contracts traded in a security or an entire market during a given period.
- **OpenInt**: The total number of outstanding derivative contracts, like options or futures. [More details here](https://www.kaggle.com/datasets/borismarjanovic/price-volume-data-for-all-us-stocks-etfs/discussion/121096)

### References

- [Huge Stock Market Dataset](https://www.kaggle.com/datasets/borismarjanovic/price-volume-data-for-all-us-stocks-etfs) for the dataset used in this lab.


## A. Checking and Preprocessing

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset is read from a path under this mount.
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Read the NVDA price file from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/lab4/nvda.us.txt')
# Bare expression so the notebook shows the DataFrame as the cell output.
df

In [None]:
# Plot the 'High' column against 'Date' on a wide (15 x 5) figure.
plot = df.plot('Date', 'High', figsize=(15, 5))

In [None]:
# Show the column labels of the loaded table.
df.columns

In [None]:
# Print a per-column summary (dtype and non-null count).
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Discard every row that contains a missing value.
# NOTE(review): dropna() keeps the original index labels, so the index can
# have gaps afterwards if any row was actually removed.
df = df.dropna()

In [None]:
# (rows, columns) remaining after the null rows were dropped.
df.shape

In [None]:
# Summary statistics of the numeric columns.
df.describe()

#### Converting the DataFrame to a NumPy Array

In [None]:
def create_sequences(input_data, output_data, window_size, step):
    """Cut a time series into fixed-length windows with one target each.

    Window ``k`` starts at row ``k * step`` and covers ``window_size``
    consecutive rows of ``input_data``; its target is the element of
    ``output_data`` at the row immediately after the window.

    Both inputs are converted to NumPy arrays first so that all indexing is
    positional. Indexing a pandas Series with ``series[i]`` is label-based
    and fails (or picks the wrong row) once the index has gaps, e.g. after
    ``dropna()``.

    Args:
        input_data: Array-like or DataFrame of shape (rows, features).
        output_data: Array-like or Series with one target value per row.
        window_size: Number of consecutive rows in each window.
        step: Stride, in rows, between the starts of successive windows.

    Returns:
        Tuple ``(sequences, labels)`` of NumPy arrays. ``sequences`` stacks
        the windows along the first axis; ``labels`` holds one target per
        window. Both are empty when the data is not longer than
        ``window_size``.
    """
    inputs = np.asarray(input_data)
    targets = np.asarray(output_data)

    # A window starting at i needs row i + window_size to exist as its target.
    starts = range(0, len(inputs) - window_size, step)
    sequences = [inputs[i:i + window_size] for i in starts]
    labels = [targets[i + window_size] for i in starts]
    return np.array(sequences), np.array(labels)

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

# Select features
features = df[['Open', 'High', 'Low', 'Close']]
# Next day's high price as label.
# create_sequences already takes the target from row ``i + window_size``, i.e.
# the day right after the window. Shifting the column by -1 as well would move
# the target one extra day ahead and leave a NaN in the last label slot.
labels = df['High']

X, y = create_sequences(features, labels, window_size=10, step=15)

print(f'Shape of data X: {X.shape}')
print(f'Shape of data y: {y.shape}')

# split the hold-out tests: ~10% of the windows, evenly spaced over the period
ind = np.linspace(0, len(X)-1, num=int(len(X)*0.1), dtype=int) # 10% hold-out
x_test = X[ind]
y_test = y[ind]
all_ind = np.arange(len(X))
remains_ind = np.delete(all_ind, ind)

X = X[remains_ind]
y = y[remains_ind]

# shuffle dataset, then split the remainder 80/20 into train/validation
ind = np.random.permutation(len(X))
X = X[ind]
y = y[ind]
split_point = int(X.shape[0]*0.8)

x_train = X[:split_point]
y_train = y[:split_point]
x_val = X[split_point:]
y_val = y[split_point:]

print(f'Shape of data x_train: {x_train.shape}')
print(f'Shape of data y_train: {y_train.shape}')
print(f'Shape of data x_val: {x_val.shape}')
print(f'Shape of data y_val: {y_val.shape}')
print(f'Shape of data x_test: {x_test.shape}')
print(f'Shape of data y_test: {y_test.shape}')

# Convert to PyTorch tensors (float32, the default dtype of the model weights)
x_train = torch.from_numpy(x_train).float()
y_train = torch.from_numpy(y_train).float()

x_val = torch.from_numpy(x_val).float()
y_val = torch.from_numpy(y_val).float()

x_test = torch.from_numpy(x_test).float()
y_test = torch.from_numpy(y_test).float()

batch_size = 32

# Create datasets
train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
test_dataset = TensorDataset(x_test, y_test)

# Create dataloaders; only the training batches are reshuffled each epoch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f'Number of samples in training and validation are {len(train_loader.dataset)} and {len(val_loader.dataset)}.')

## B. Defining Neural Networks

In [None]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of values produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the head's output for the final time step of each sequence."""
        sequence_output, _state = self.lstm(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

## C. Training the Neural Network

In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm.auto import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-epoch loss history; read by the plotting cell below.
train_losses = []
val_losses = []

epochs = 1

model = LSTMModel(input_dim=4, hidden_dim=500, num_layers=2, output_dim=1).to(device)
print(model)

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_val_loss = float('inf')

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Cosine decay of the learning rate towards 0 over `epochs` scheduler steps.
lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0)

for epoch in tqdm(range(epochs)):
    # Training
    model.train()
    total_loss = 0.0

    for features, labels in train_loader:
        features = features.to(device)
        labels = labels.to(device)
        # Drop the trailing size-1 output dimension so predictions line up with the labels.
        outputs = model(features).squeeze(-1)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Learning rate update
    lr_scheduler.step()

    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for features, labels in val_loader:
            features = features.to(device)
            labels = labels.to(device)
            outputs = model(features).squeeze(-1)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    # Checkpoint: keep the weights of the epoch with the lowest validation loss.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pth')

    print(f'Epoch {epoch+1}/{epochs}, Train loss: {avg_train_loss:.4f}, Val loss: {avg_val_loss:.4f}, Best Val loss: {best_val_loss:.4f}')

    # Store performance
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

#### Visualizing the model performance

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss curves on one figure.
plt.figure(figsize=(10, 5))
for curve, name in ((train_losses, 'Train'), (val_losses, 'Val')):
    plt.plot(curve, label=name)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

## D. Evaluating Your Trained Model

In [None]:
# Load the trained weights. map_location keeps this working when the
# checkpoint was written on a GPU but the current session only has a CPU.
model.load_state_dict(torch.load('best_model.pth', map_location=device))

# Set the model to evaluation mode
model.eval()

# Per-batch predictions and targets, concatenated after the loop.
pred_value = []
actual_value = []

with torch.no_grad():
    for features, labels in test_loader:
        features = features.to(device)
        outputs = model(features).squeeze(-1)
        # Move predictions back to the CPU so they can be joined with the labels.
        pred_value.append(outputs.cpu())
        actual_value.append(labels)

pred_value = torch.cat(pred_value)
actual_value = torch.cat(actual_value)

In [None]:
# Predicted vs. actual prices over the hold-out samples.
plt.figure(figsize=(10, 5))
for series, name in ((pred_value, 'Pred'), (actual_value, 'Actual')):
    plt.plot(series, label=name)
plt.xlabel('Day')
plt.ylabel('Price')
plt.legend()

plt.show()

## Homework

### Q1

In [None]:
def data_pipeline(window_size=10, step=15, batch_size=32):
    """Build train/validation/test DataLoaders of sliding price windows.

    Each sample is ``window_size`` consecutive rows of Open/High/Low/Close
    from the global ``df``; its target is the High of the row immediately
    after the window.

    Args:
        window_size: Number of consecutive rows in each input window.
        step: Stride, in rows, between the starts of successive windows.
        batch_size: Mini-batch size used by all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    features = df[['Open', 'High', 'Low', 'Close']]
    # create_sequences already takes the target from row ``i + window_size``
    # (the day right after the window). Shifting the column by -1 as well
    # would push the target one more day ahead and leave a NaN in the last row.
    labels = df['High']

    X, y = create_sequences(features, labels, window_size=window_size, step=step)

    # Hold out ~10% of the windows, evenly spaced over the whole period.
    test_ind = np.linspace(0, len(X) - 1, num=int(len(X) * 0.1), dtype=int)
    x_test, y_test = X[test_ind], y[test_ind]
    remains_ind = np.delete(np.arange(len(X)), test_ind)
    X, y = X[remains_ind], y[remains_ind]

    # Shuffle the remaining windows, then split them 80/20.
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]
    split_point = int(X.shape[0] * 0.8)

    def to_dataset(inputs, targets):
        # float32 tensors, the default dtype of the model weights.
        return TensorDataset(torch.from_numpy(inputs).float(),
                             torch.from_numpy(targets).float())

    train_dataset = to_dataset(X[:split_point], y[:split_point])
    val_dataset = to_dataset(X[split_point:], y[split_point:])
    test_dataset = to_dataset(x_test, y_test)

    # Only the training batches are reshuffled every epoch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = LSTMModel(input_dim=4, hidden_dim=500, num_layers=2, output_dim=1).to(device)

def Trainer(train_loader, val_loader, model, device, epochs=100):
    """Train ``model`` with Adam and cosine LR decay, then restore the best weights.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    mean validation loss improves, the weights are written to
    ``best_model.pth``. When training ends, that checkpoint is loaded back
    into ``model`` in place.

    Validation, checkpointing and loss logging run inside the epoch loop.
    Running them once after the loop would make the "best" checkpoint simply
    the final weights and would record a single loss value per call.

    Side effects:
        Appends one value per epoch to the module-level lists
        ``train_losses`` and ``val_losses``; overwrites ``best_model.pth``.

    Args:
        train_loader: DataLoader yielding ``(features, labels)`` training batches.
        val_loader: DataLoader yielding ``(features, labels)`` validation batches.
        model: Network to optimise; modified in place.
        device: Device on which batches are placed.
        epochs: Number of passes over ``train_loader``.
    """
    best_val_loss = float('inf')
    # Tracks whether this call wrote a checkpoint, so a stale file left by an
    # earlier run (possibly with a different architecture) is never loaded.
    checkpoint_saved = False

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0)

    for epoch in tqdm(range(epochs)):
        # Training
        model.train()
        total_loss = 0.0

        for features, labels in train_loader:
            features = features.to(device)
            labels = labels.to(device)
            # Drop the trailing size-1 output dimension to match the labels.
            outputs = model(features).squeeze(-1)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Learning rate update
        lr_scheduler.step()

        avg_train_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0.0

        with torch.no_grad():
            for features, labels in val_loader:
                features = features.to(device)
                labels = labels.to(device)
                outputs = model(features).squeeze(-1)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        # Checkpoint
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            checkpoint_saved = True

        # Store performance
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

    if checkpoint_saved:
        # map_location keeps the load valid whatever device the weights were saved from.
        model.load_state_dict(torch.load('best_model.pth', map_location=device))

def Tester(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and collect predictions and targets.

    Args:
        test_loader: DataLoader yielding ``(features, labels)`` batches.
        model: Trained network; switched to evaluation mode here.
        device: Device on which the feature batches are placed.

    Returns:
        Tuple ``(pred_value, actual_value)`` of 1-D NumPy arrays in loader
        order. Both are empty when the loader yields no batches.
    """
    model.eval()

    pred_batches = []
    actual_batches = []

    with torch.no_grad():
        for features, labels in test_loader:
            features = features.to(device)
            # Drop the trailing size-1 output dimension to match the labels.
            outputs = model(features).squeeze(-1)
            pred_batches.append(outputs.cpu())
            actual_batches.append(labels.cpu())

    if not pred_batches:
        # torch.cat raises on an empty list, e.g. when the hold-out set is empty.
        return torch.empty(0).numpy(), torch.empty(0).numpy()

    pred_value = torch.cat(pred_batches).detach().numpy()
    actual_value = torch.cat(actual_batches).detach().numpy()
    return pred_value, actual_value

In [None]:
def hw1(window_size, step):
    """Train a fresh 4-feature LSTM for one window/step setting and test it.

    Returns whatever ``Tester`` returns for the hold-out loader.
    """
    loaders = data_pipeline(window_size=window_size, step=step)
    train_loader, val_loader, test_loader = loaders

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMModel(input_dim=4, hidden_dim=500, num_layers=2, output_dim=1)
    model = model.to(device)

    Trainer(train_loader, val_loader, model, device)
    result = Tester(test_loader, model, device)
    return result

In [None]:
# (window_size, step) settings compared in Q1.
exp_params = [(5, 5), (5, 10), (10, 5)]
pred, mse = [], []
for window_size, step in exp_params:
    pred_value, actual_value = hw1(window_size, step)
    pred.append(pred_value)
    # Mean squared error on the hold-out set for this setting.
    mse.append(np.square(actual_value - pred_value).mean())


In [None]:
# Pair each setting with its MSE directly rather than indexing with a
# hard-coded count, so the report follows the length of exp_params.
for (window_size, step), error in zip(exp_params, mse):
    print(f"Window Size: {window_size}, Step: {step}; **MSE** = {error}")
# for i in range(3):
#     plt.plot(pred[i][:], label=f"{exp_params[i]}")

# plt.plot(actual_value[:], label="ground truth")
# plt.legend()
# plt.show()

In [None]:
# Extra run with window_size=10, step=15; its MSE is appended after the
# entries that correspond to exp_params.
pred_value, actual_value = hw1(10, 15)
mse.append(np.square(actual_value - pred_value).mean())

### Q2

In [None]:
def data_pipeline(window_size=10, step=15, batch_size=32):
    """Build train/validation/test DataLoaders of sliding price windows.

    Each sample is ``window_size`` consecutive rows of Open/Close/High/Low
    from the global ``df``; its target is the High of the row immediately
    after the window.

    Args:
        window_size: Number of consecutive rows in each input window.
        step: Stride, in rows, between the starts of successive windows.
        batch_size: Mini-batch size used by all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    features = df[['Open', 'Close', 'High', 'Low']]
    # create_sequences already takes the target from row ``i + window_size``
    # (the day right after the window). Shifting the column by -1 as well
    # would push the target one more day ahead and leave a NaN in the last row.
    labels = df['High']

    X, y = create_sequences(features, labels, window_size=window_size, step=step)

    # Hold out ~10% of the windows, evenly spaced over the whole period.
    test_ind = np.linspace(0, len(X) - 1, num=int(len(X) * 0.1), dtype=int)
    x_test, y_test = X[test_ind], y[test_ind]
    remains_ind = np.delete(np.arange(len(X)), test_ind)
    X, y = X[remains_ind], y[remains_ind]

    # Shuffle the remaining windows, then split them 80/20.
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]
    split_point = int(X.shape[0] * 0.8)

    def to_dataset(inputs, targets):
        # float32 tensors, the default dtype of the model weights.
        return TensorDataset(torch.from_numpy(inputs).float(),
                             torch.from_numpy(targets).float())

    train_dataset = to_dataset(X[:split_point], y[:split_point])
    val_dataset = to_dataset(X[split_point:], y[split_point:])
    test_dataset = to_dataset(x_test, y_test)

    # Only the training batches are reshuffled every epoch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

def hw2(window_size, step):
    """Train a fresh 4-feature LSTM for one window/step setting and test it.

    Returns whatever ``Tester`` returns for the hold-out loader.
    """
    loaders = data_pipeline(window_size=window_size, step=step)
    train_loader, val_loader, test_loader = loaders

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMModel(input_dim=4, hidden_dim=500, num_layers=2, output_dim=1)
    model = model.to(device)

    Trainer(train_loader, val_loader, model, device)
    result = Tester(test_loader, model, device)
    return result

In [None]:
# (window_size, step) settings compared in Q2 with a 5-day window.
exp_params = [(5, 5), (5, 3), (5, 2)]
mse = []
pred = []
for window_size, step in exp_params:
    pred_value, actual_value = hw2(window_size, step)
    pred.append(pred_value)
    mse.append(np.mean((actual_value - pred_value) ** 2))
    # Report from the loop variables and the entry just appended; no
    # hand-maintained counter is needed.
    print(f"Window Size: {window_size}, Step: {step}; **MSE** = {mse[-1]}")

In [None]:
# (window_size, step) settings compared in Q2 with a 3-day window.
exp_params = [(3, 5), (3, 3), (3, 2)]
mse = []
pred = []
for window_size, step in exp_params:
    pred_value, actual_value = hw2(window_size, step)
    pred.append(pred_value)
    mse.append(np.mean((actual_value - pred_value) ** 2))
    # Report from the loop variables and the entry just appended; no
    # hand-maintained counter is needed.
    print(f"Window Size: {window_size}, Step: {step}; **MSE** = {mse[-1]}")

### Q3

In [None]:
# Remove the columns that are not used for normalisation below.
# errors='ignore' makes the cell safe to re-run: without it, a second
# execution raises KeyError because the columns are already gone.
df = df.drop(columns=['Date', 'OpenInt'], errors='ignore')

In [None]:
def data_pipeline(window_size=10, step=15, batch_size=32):
    """Build DataLoaders of z-score-normalised sliding price windows.

    Every column of the global ``df`` is standardised with its own mean and
    standard deviation. Each sample is ``window_size`` consecutive rows of
    the normalised Open/Close/High/Low; its target is the normalised High of
    the row immediately after the window.

    Args:
        window_size: Number of consecutive rows in each input window.
        step: Stride, in rows, between the starts of successive windows.
        batch_size: Mini-batch size used by all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    # NOTE(review): the statistics are taken over all rows, so the hold-out
    # windows influence the normalisation of the training data.
    normalized_df = (df - df.mean()) / df.std()
    features = normalized_df[['Open', 'Close', 'High', 'Low']]
    # create_sequences already takes the target from row ``i + window_size``
    # (the day right after the window). Shifting the column by -1 as well
    # would push the target one more day ahead and leave a NaN in the last row.
    labels = normalized_df['High']

    X, y = create_sequences(features, labels, window_size=window_size, step=step)

    # Hold out ~10% of the windows, evenly spaced over the whole period.
    test_ind = np.linspace(0, len(X) - 1, num=int(len(X) * 0.1), dtype=int)
    x_test, y_test = X[test_ind], y[test_ind]
    remains_ind = np.delete(np.arange(len(X)), test_ind)
    X, y = X[remains_ind], y[remains_ind]

    # Shuffle the remaining windows, then split them 80/20.
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]
    split_point = int(X.shape[0] * 0.8)

    def to_dataset(inputs, targets):
        # float32 tensors, the default dtype of the model weights.
        return TensorDataset(torch.from_numpy(inputs).float(),
                             torch.from_numpy(targets).float())

    train_dataset = to_dataset(X[:split_point], y[:split_point])
    val_dataset = to_dataset(X[split_point:], y[split_point:])
    test_dataset = to_dataset(x_test, y_test)

    # Only the training batches are reshuffled every epoch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

def hw3(window_size, step):
    """Train a fresh 4-feature LSTM for one window/step setting and test it.

    Returns whatever ``Tester`` returns for the hold-out loader.
    """
    loaders = data_pipeline(window_size=window_size, step=step)
    train_loader, val_loader, test_loader = loaders

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMModel(input_dim=4, hidden_dim=500, num_layers=2, output_dim=1)
    model = model.to(device)

    Trainer(train_loader, val_loader, model, device)
    result = Tester(test_loader, model, device)
    return result

In [None]:
# (window_size, step) settings compared in Q3 on normalised prices.
exp_params = [(5, 3), (5, 2), (3, 2)]
mse = []
pred = []
# De-normalisation constants for the High column. They do not change between
# runs, so compute them once rather than re-deriving the statistics of every
# column twice per iteration.
high_std = df['High'].std()
high_mean = df['High'].mean()
for window_size, step in exp_params:
    pred_value, actual_value = hw3(window_size, step)
    # Map predictions and targets back to price units before scoring.
    pred_value = pred_value * high_std + high_mean
    actual_value = actual_value * high_std + high_mean
    pred.append(pred_value)
    mse.append(np.mean((actual_value - pred_value) ** 2))
    print(f"Window Size: {window_size}, Step: {step}; **MSE** = {mse[-1]}")

In [None]:
def data_pipeline(window_size=10, step=15, batch_size=32):
    """Build DataLoaders of z-score-normalised price-and-volume windows.

    Every column of the global ``df`` is standardised with its own mean and
    standard deviation. Each sample is ``window_size`` consecutive rows of
    the normalised Open/Close/High/Low/Volume; its target is the normalised
    High of the row immediately after the window.

    Args:
        window_size: Number of consecutive rows in each input window.
        step: Stride, in rows, between the starts of successive windows.
        batch_size: Mini-batch size used by all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    # NOTE(review): the statistics are taken over all rows, so the hold-out
    # windows influence the normalisation of the training data.
    normalized_df = (df - df.mean()) / df.std()
    features = normalized_df[['Open', 'Close', 'High', 'Low', 'Volume']]
    # create_sequences already takes the target from row ``i + window_size``
    # (the day right after the window). Shifting the column by -1 as well
    # would push the target one more day ahead and leave a NaN in the last row.
    labels = normalized_df['High']

    X, y = create_sequences(features, labels, window_size=window_size, step=step)

    # Hold out ~10% of the windows, evenly spaced over the whole period.
    test_ind = np.linspace(0, len(X) - 1, num=int(len(X) * 0.1), dtype=int)
    x_test, y_test = X[test_ind], y[test_ind]
    remains_ind = np.delete(np.arange(len(X)), test_ind)
    X, y = X[remains_ind], y[remains_ind]

    # Shuffle the remaining windows, then split them 80/20.
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]
    split_point = int(X.shape[0] * 0.8)

    def to_dataset(inputs, targets):
        # float32 tensors, the default dtype of the model weights.
        return TensorDataset(torch.from_numpy(inputs).float(),
                             torch.from_numpy(targets).float())

    train_dataset = to_dataset(X[:split_point], y[:split_point])
    val_dataset = to_dataset(X[split_point:], y[split_point:])
    test_dataset = to_dataset(x_test, y_test)

    # Only the training batches are reshuffled every epoch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

def hw3_2(window_size, step):
    """Train a fresh 5-feature LSTM for one window/step setting and test it.

    Returns whatever ``Tester`` returns for the hold-out loader.
    """
    loaders = data_pipeline(window_size=window_size, step=step)
    train_loader, val_loader, test_loader = loaders

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # input_dim=5: one more input feature than the 4-feature runs.
    model = LSTMModel(input_dim=5, hidden_dim=500, num_layers=2, output_dim=1)
    model = model.to(device)

    Trainer(train_loader, val_loader, model, device)
    result = Tester(test_loader, model, device)
    return result

In [None]:
# (window_size, step) settings compared in Q3 with the 5-feature model.
exp_params = [(5, 3), (5, 2), (3, 2)]
mse = []
pred = []
# De-normalisation constants for the High column. They do not change between
# runs, so compute them once rather than re-deriving the statistics of every
# column twice per iteration.
high_std = df['High'].std()
high_mean = df['High'].mean()
for window_size, step in exp_params:
    pred_value, actual_value = hw3_2(window_size, step)
    # Map predictions and targets back to price units before scoring.
    pred_value = pred_value * high_std + high_mean
    actual_value = actual_value * high_std + high_mean
    pred.append(pred_value)
    mse.append(np.mean((actual_value - pred_value) ** 2))
    print(f"Window Size: {window_size}, Step: {step}; **MSE** = {mse[-1]}")