In [60]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import torch.nn as nn

In [62]:
# Run on the GPU whenever CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [64]:
# Load the cleaned per-stock daily price table and preview its first rows.
# NOTE(review): the preview shows `Unnamed: 0`-style columns, which usually come
# from a DataFrame index that was written to CSV -- confirm and drop if unused.
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,name,percent_movement
0,0,1999-11-18,26.511736,31.473534,35.765381,28.612303,32.546494,62546380.0,A,-3.296699
1,1,1999-11-19,24.327528,28.880545,30.758226,28.478184,30.713518,15234146.0,A,-5.96797
2,2,1999-11-22,26.511736,31.473534,31.473534,28.657009,29.551144,6577870.0,A,6.505298
3,3,1999-11-23,24.101582,28.612303,31.205294,28.612303,30.400572,5975611.0,A,-5.882353
4,4,1999-11-24,24.741781,29.372318,29.998213,28.612303,28.701717,4843231.0,A,2.336449


In [65]:
# Give every distinct ticker an integer ID, numbered in order of first appearance
stock_names = df['name'].unique()
stock_to_id = dict(zip(stock_names, range(len(stock_names))))

# Attach the integer ID to each row of the dataframe
df['stock_id'] = df['name'].map(stock_to_id)

In [66]:
# Preview the frame again to check that the new stock_id column is present.
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,name,percent_movement,stock_id
0,0,1999-11-18,26.511736,31.473534,35.765381,28.612303,32.546494,62546380.0,A,-3.296699,0
1,1,1999-11-19,24.327528,28.880545,30.758226,28.478184,30.713518,15234146.0,A,-5.96797,0
2,2,1999-11-22,26.511736,31.473534,31.473534,28.657009,29.551144,6577870.0,A,6.505298,0
3,3,1999-11-23,24.101582,28.612303,31.205294,28.612303,30.400572,5975611.0,A,-5.882353,0
4,4,1999-11-24,24.741781,29.372318,29.998213,28.612303,28.701717,4843231.0,A,2.336449,0


In [72]:
class StockGRUModel(nn.Module):
    """GRU regressor with one linear output head per prediction horizon.

    A stacked GRU encodes the input sequence; the GRU output at the final
    time step is fed to the linear head that belongs to the requested horizon.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the GRU hidden state.
        max_horizon: Number of output heads. Head ``i`` serves horizon ``i + 1``.
        num_layers: Number of stacked GRU layers (defaults to the previous
            hard-coded value of 2).
        dropout: Dropout applied between GRU layers (defaults to the previous
            hard-coded value of 0.2). Forced to 0 for a single-layer GRU,
            where inter-layer dropout has no effect.

    Raises:
        ValueError: If ``max_horizon`` is smaller than 1.
    """

    def __init__(self, input_size, hidden_size, max_horizon=30,
                 num_layers=2, dropout=0.2):
        super().__init__()
        if max_horizon < 1:
            raise ValueError(f"max_horizon must be >= 1, got {max_horizon}")
        self.hidden_size = hidden_size

        # GRU layers
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )

        # Multiple output heads for different prediction horizons
        self.output_heads = nn.ModuleList([
            nn.Linear(hidden_size, 1) for _ in range(max_horizon)
        ])

    def forward(self, x, horizon):
        """Predict a single value per sequence for the given horizon.

        Args:
            x: Input of shape ``(batch, seq_len, input_size)`` (the GRU is
                built with ``batch_first=True``).
            horizon: 1-based prediction horizon. Values larger than the number
                of heads are clamped to the last head.

        Returns:
            Tensor of shape ``(batch, 1)``.

        Raises:
            ValueError: If ``horizon`` is smaller than 1. Previously such a
                value produced a negative index and silently selected the
                last (longest-horizon) head.
        """
        if horizon < 1:
            raise ValueError(f"horizon must be >= 1, got {horizon}")

        # No explicit h0: nn.GRU starts from a zero hidden state on its own,
        # matching the input's device and dtype, so this no longer breaks for
        # non-float32 inputs or a layer count other than 2.
        out, _ = self.gru(x)

        # Keep only the final time step of every sequence
        last_step = out[:, -1, :]

        # Select the appropriate output head based on horizon
        # (clamping to ensure we don't exceed the number of heads)
        head_idx = min(horizon - 1, len(self.output_heads) - 1)
        return self.output_heads[head_idx](last_step)

Using date, percentage, and stock ID as the primary features. Depending on performance, the feature set will be adjusted.

In [75]:
# All stocks live in one big frame, so a single sequential cut would not split
# each stock's history. Split every stock separately and concatenate the pieces.

# Leading share of each stock's rows (in date order) that goes to training
TRAIN_FRACTION = 0.8

# Sort by stock_id and date
df['Date'] = pd.to_datetime(df['Date'])  # make sure it's datetime
df = df.sort_values(by=['stock_id', 'Date'])

train_dfs = []
test_dfs = []

# Group by each stock. groupby keeps the row order inside each group, so every
# group is already in date order and needs no second sort; sort=False skips
# re-sorting the group keys because the frame is already ordered by stock_id.
for stock_id, stock_df in df.groupby('stock_id', sort=False):
    # Find the index to split
    split_idx = int(len(stock_df) * TRAIN_FRACTION)

    train_dfs.append(stock_df.iloc[:split_idx])
    test_dfs.append(stock_df.iloc[split_idx:])

train_df = pd.concat(train_dfs).reset_index(drop=True)
test_df = pd.concat(test_dfs).reset_index(drop=True)

# Every row must land in exactly one of the two splits.
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (14707061, 11)
Test shape: (3680161, 11)


### Although it doesn't make sense to use our target column `percent_movement` as a feature too, the methodology states that we need to use as many features as possible, which at first includes everything. We can build another model without it afterwards for comparison (which is the point of the research).

In [80]:
feature_columns = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume', 'percent_movement']
# `target_column` was used below without ever being defined, which raises a
# NameError on a fresh kernel. The notebook names percent_movement as its target.
# NOTE(review): the target is also listed in feature_columns on purpose (see the
# methodology note); presumably it is shifted in time when sequences are built
# later -- confirm, otherwise each row's label is also one of its inputs.
target_column = 'percent_movement'

X_train = train_df[feature_columns].to_numpy()
y_train = train_df[target_column].to_numpy()

X_test = test_df[feature_columns].to_numpy()
y_test = test_df[target_column].to_numpy()


In [82]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,name,percent_movement,stock_id
0,0,1999-11-18,26.511736,31.473534,35.765381,28.612303,32.546494,62546380.0,A,-3.296699,0
1,1,1999-11-19,24.327528,28.880545,30.758226,28.478184,30.713518,15234146.0,A,-5.96797,0
2,2,1999-11-22,26.511736,31.473534,31.473534,28.657009,29.551144,6577870.0,A,6.505298,0
3,3,1999-11-23,24.101582,28.612303,31.205294,28.612303,30.400572,5975611.0,A,-5.882353,0
4,4,1999-11-24,24.741781,29.372318,29.998213,28.612303,28.701717,4843231.0,A,2.336449,0
