### Stock Market Prediction using Graph Neural Networks

In [1]:
from dataset.data import *
from models.gnn import *
from datetime import datetime

### Data Collection

In [None]:
# Locations of the raw inputs inside the local kagglehub cache:
# the per-stock directory, the symbol metadata CSV and the NASDAQ-100 CSV.
stock_path = '/Users/vivek/.cache/kagglehub/datasets/jacksoncrow/stock-market-dataset/versions/2/stocks'
meta_path  = '/Users/vivek/.cache/kagglehub/datasets/jacksoncrow/stock-market-dataset/versions/2/symbols_valid_meta.csv'
nasdaq100_path = '/Users/vivek/.cache/kagglehub/datasets/jacksoncrow/stock-market-dataset/versions/2/nasdaq_100.csv'

# Build the dataset wrapper (collect all data after 2014-01-01) and read the files.
stock_data = NASDAQDataset(
    stock_path,
    meta_path,
    nasdaq100_path,
    "2014-01-01",
)
stock_data.read_data()

# Generate one graph per date, skipping the first 30 entries of AAPL's index.
# NOTE(review): the original note says a minimum of 30 days is needed to compute
# correlations, yet window_size=10 is what gets passed below -- confirm which is intended.
graph_dates = stock_data.data['AAPL'].index[30:]

for date in tqdm(graph_dates):
    G = stock_data.daily_graph_generator(
        date,
        threshold=1,
        window_size=10,
    )


In [3]:
import random

# Read the gml files and create PyG data objects (required only once).
# The split below is made reproducible: the file list is sorted (os.listdir
# returns entries in arbitrary order) and sampling uses a dedicated, seeded
# generator so the global `random` state is left untouched.

data_path = '/Users/vivek/Documents/PhD/UIUC/Fall24/CS598/Project/GNN-Stock-Market-Prediction/dataset/graphs'
data_list = sorted(os.path.join(data_path, file) for file in os.listdir(data_path))

# Select 1000 random elements from data_list for training.
# random.sample raises ValueError if fewer than 1000 files are available.

split_rng = random.Random(42)
train_list = split_rng.sample(data_list, 1000)

# Set membership keeps the complement computation linear in the number of files.
train_set = set(train_list)
test_list = [file for file in data_list if file not in train_set]

train_loader = gnn_data_obj(train_list) # train on the 1000 randomly sampled graphs
test_loader = gnn_data_obj(test_list) # test on all remaining graphs

100%|██████████| 1000/1000 [00:13<00:00, 71.71it/s]
100%|██████████| 541/541 [00:07<00:00, 73.02it/s]


In [None]:
# # Save the train and test datasets

# save_path = '/Users/vivek/Documents/PhD/UIUC/Fall24/CS598/Project/GNN-Stock-Market-Prediction/dataset/processed_dir/10/'
# torch.save(train_loader, os.path.join(save_path, 'train_10_1.pt'))
# torch.save(test_loader, os.path.join(save_path, 'test_10_1.pt'))

In [9]:
# Load the processed data (saves time)

load_train_path = '/Users/vivek/Documents/PhD/UIUC/Fall24/CS598/Project/GNN-Stock-Market-Prediction/dataset/processed_dir/10/train_10_1.pt'
load_test_path = '/Users/vivek/Documents/PhD/UIUC/Fall24/CS598/Project/GNN-Stock-Market-Prediction/dataset/processed_dir/10/test_10_1.pt'

# Load the list of Data objects.
# These files hold pickled Python objects rather than plain tensors, so
# weights_only=False is passed explicitly: newer PyTorch releases default to
# weights_only=True, which refuses to unpickle arbitrary classes.
# SECURITY: unpickling can execute arbitrary code -- only load files that were
# produced locally by this project.

train_loader = torch.load(load_train_path, weights_only=False)
test_loader = torch.load(load_test_path, weights_only=False)

In [10]:
class StockGAT(torch.nn.Module):
    """Two-layer graph attention network (GAT) for node regression.

    The first GATConv maps the input node features to a hidden
    representation; after ReLU and dropout, the second GATConv maps the
    hidden representation to the per-node output. Both layers are built with
    ``concat=False``, so the layer output width equals the requested number
    of channels regardless of ``heads``.

    Args:
        in_channels: Number of input features per node (default 20).
        hidden_channels: Width of the hidden representation (default 64).
        out_channels: Number of values predicted per node (default 1).
        heads: Number of attention heads used by both layers (default 2).
        dropout: Dropout probability, used both inside the GATConv layers
            and on the hidden activations (default 0.3).

    The defaults reproduce the previously hard-coded architecture, so
    ``StockGAT()`` builds the same model as before.
    """

    def __init__(self, in_channels=20, hidden_channels=64, out_channels=1, heads=2, dropout=0.3):

        super().__init__()

        # Kept so forward() applies the same dropout rate as the conv layers.
        self.dropout = dropout

        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, concat=False, dropout=dropout)
        self.conv2 = GATConv(hidden_channels, out_channels, heads=heads, concat=False, dropout=dropout)

    def forward(self, data):
        """Run the network on one graph object.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``
                (graph connectivity); edge attributes are not used.

        Returns:
            The output of the second GATConv layer (no final activation).
        """

        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        # Only active in training mode; a no-op under model.eval().
        x = F.dropout(x, p=self.dropout, training=self.training)

        return self.conv2(x, edge_index)

In [11]:
# # Check if there are edges other than self loops

# count = 0

# for data in train_loader:
    
#     G = nx.read_gml(data)

#     if len(G.edges) > len(G.nodes):
#         print(data)
#         count += 1

### Training

In [12]:
# Choose the compute device, then create one model and one Adam optimizer
# for each of the three architectures (GCN, GAT, GraphSAGE).

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Models are instantiated in the order GCN -> GAT -> GraphSAGE.
stock_gcn_model = StockGCN().to(device)
stock_gat_model = StockGAT().to(device)
stock_graph_sage_model = StockGraphSAGE().to(device)

# All optimizers share weight_decay = 1e-4; the GAT uses a slightly lower learning rate.
gcn_optimizer = torch.optim.Adam(
    stock_gcn_model.parameters(),
    lr=0.001,
    weight_decay=1e-4,
)
gat_optimizer = torch.optim.Adam(
    stock_gat_model.parameters(),
    lr=0.0008,
    weight_decay=1e-4,
)
graph_sage_optimizer = torch.optim.Adam(
    stock_graph_sage_model.parameters(),
    lr=0.001,
    weight_decay=1e-4,
)

In [13]:

# Evaluate the baseline forecasters on the test data and report both metrics.
mape, rmse = eval_baseline(test_loader)

# Position i of each metric container is reported under the i-th label below.
baseline_labels = (
    'Linear Regression',
    'Moving Average',
    'Exponential Smoothing',
    'Holt-Winters',
)

# MAPE values are printed scaled by 100 (presumably fraction -> percent; confirm in eval_baseline).
for position, label in enumerate(baseline_labels):
    print(f'MAPE {label}: {mape[position] * 100}')

print("*"*50)

for position, label in enumerate(baseline_labels):
    print(f'RMSE {label}: {rmse[position]}')

MAPE Linear Regression: 1.9765061990817572
MAPE Moving Average: 1.999117126395709
MAPE Exponential Smoothing: 1.8946309610960281
MAPE Holt-Winters: 1.844034975053753
**************************************************
RMSE Linear Regression: 6.757386684417725
RMSE Moving Average: 6.753388404846191
RMSE Exponential Smoothing: 6.342061996459961
RMSE Holt-Winters: 6.2757368087768555


In [None]:
# Train the GCN model; the final positional argument (300) is passed to train()
# unchanged -- presumably the number of epochs, confirm against train().
# The call's result is unpacked into three values.

(train_loss, test_mape, test_rmse) = train(
    stock_gcn_model,
    gcn_optimizer,
    train_loader,
    test_loader,
    300,
)

In [None]:
# Train the GAT model with its own optimizer on the same train/test data;
# the call's result is unpacked into three values.
(train_loss_gat, test_mape_gat, test_rmse_gat) = train(
    stock_gat_model,
    gat_optimizer,
    train_loader,
    test_loader,
    300,
)



In [None]:
# Train the GraphSAGE model with its own optimizer on the same train/test data;
# the call's result is unpacked into three values.
(train_loss_graph_sage, test_mape_graph_sage, test_rmse_graph_sage) = train(
    stock_graph_sage_model,
    graph_sage_optimizer,
    train_loader,
    test_loader,
    300,
)

In [None]:
# Plot the validation MAPE recorded for the GAT run, using an explicit
# figure/axes pair instead of the implicit pyplot state.

fig, ax = plt.subplots()
ax.set_title('Validation MAPE')
# ax.plot(test_mape, label='GCN')
ax.plot(test_mape_gat, label='GAT')
ax.set_xlabel('Epochs')
ax.set_ylabel('Mean Absolute Percentage Error')
ax.legend()
plt.show()