## Setup the Env


In [1]:
import os
import sys

import sagemaker
from sagemaker import get_execution_role

# Add the parent directory to the sys.path
sys.path.insert(0, os.path.abspath(".."))

# Define IAM role
role = get_execution_role()
role

# Establish S3 bucket connection
import boto3

s3 = boto3.client("s3")
bucket = "capstone-bucket-4-friends"

# Take a look at current dir
print(os.getcwd())

from file_utilities import s3_download

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
/home/sagemaker-user/capstone-2024-summer/src/shuo


## Import Package

In [2]:
!pip install seaborn
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import joblib

warnings.filterwarnings("ignore")

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


## Load Data

In [3]:
class FinancialDataset(Dataset):
    def __init__(self, data, window_size=128, one_hot_encoder=None):
        self.data = data
        self.window_size = window_size
        self.input_columns = [
            "log_return",
            "sector_weighted_avg_log_return",
            "sector_simple_avg_log_return",
            "weighted_avg_log_return",
            "simple_avg_log_return",
        ]
        self.target_columns = [
            "log_return",
            "abs_log_return",
            "log_return_7",
            "abs_log_return_7",
            "log_return_14",
            "abs_log_return_14",
            "log_return_28",
            "abs_log_return_28",
            "volatility_7",
            "volatility_14",
            "volatility_28",
        ]

        self.data[self.input_columns] = self.data[self.input_columns].apply(pd.to_numeric, errors="coerce")
        self.data[self.target_columns] = self.data[self.target_columns].apply(pd.to_numeric, errors="coerce")
        self.data = self.data.dropna()

        self.one_hot_encoder = one_hot_encoder
        if self.one_hot_encoder is None:
            self.one_hot_encoder = OneHotEncoder(sparse_output=False)
            self.data["permno_id"] = self.data["permno_id"].astype(str)
            self.permno_one_hot = self.one_hot_encoder.fit_transform(self.data[["permno_id"]])
        else:
            self.data["permno_id"] = self.data["permno_id"].astype(str)
            self.permno_one_hot = self.one_hot_encoder.transform(self.data[["permno_id"]])

        # Assign positional encoding
        self.data['positional_encoding'] = self.data.groupby("permno_id").cumcount()
        
        # Group by permno_id
        self.groups = self.data.groupby("permno_id")
        self.sequences = []

        for name, group in self.groups:
            permno_id_one_hot = self.one_hot_encoder.transform(group[["permno_id"]])
            if len(group) > self.window_size:
                for i in range(len(group) - self.window_size):
                    X = group[self.input_columns].iloc[i : i + self.window_size].values
                    # print(X)
                    # permno_id_one_hot_repeated = np.tile(permno_id_one_hot[i], (self.window_size, 1))
                    # print(permno_id_one_hot_repeated.shape)
                    # positional_encoding = np.arange(self.window_size).reshape(-1, 1)
                    # print(positional_encoding.shape)
                    X = np.concatenate([X, permno_id_one_hot[i:i + self.window_size]], axis=1)  # Concatenate one-hot encoded permno_id
                    X = np.concatenate([X, group['positional_encoding'].iloc[i: i + self.window_size].values.reshape(-1, 1)], axis=1)
                    # X = np.concatenate([X, permno_id_one_hot_repeated, positional_encoding], axis=1)
                    # print(X.shape)
                    y = group[self.target_columns].iloc[i + self.window_size].values
                    self.sequences.append((X, y, name))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        X, y, permno_id = self.sequences[idx]
        return {
            "features": torch.tensor(X, dtype=torch.float),
            "target": torch.tensor(y, dtype=torch.float),
            "permno_id": permno_id,
        }

In [4]:
%%time
# Load datasets
train_dataset = joblib.load('/home/sagemaker-user/capstone-2024-summer/data/crsp_2018-2023_S&P500_train_dataset.pkl')
val_dataset = joblib.load('/home/sagemaker-user/capstone-2024-summer/data/crsp_2018-2023_S&P500_val_dataset.pkl')

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

CPU times: user 3min 50s, sys: 5min 48s, total: 9min 38s
Wall time: 53min 13s


In [7]:
# Load the OneHotEncoder if needed for further processing
one_hot_encoder = joblib.load('/home/sagemaker-user/capstone-2024-summer/data/one_hot_encoder.pkl')

## Build and Train Model

In [5]:
%%time
class TransformerModel(nn.Module):
    def __init__(
        self,
        input_dim,
        embed_dim,
        n_heads,
        ff_dim,
        n_layers,
        output_dim,
        hidden_dims=[256, 128],
        dropout=0.25,
        attn_dropout=0.1,
        residual_dropout=0.1,
    ):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=attn_dropout
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.hidden_layers = nn.Sequential(
            nn.Linear(embed_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        self.fc = nn.Linear(hidden_dims[1], output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(0, 1)  # Transformer expects input of shape (seq_len, batch_size, embed_dim))
        x = self.transformer(x)
        embedding = x[-1, :, :]  # Get the last output token
        x = self.hidden_layers(embedding)
        x = self.fc(x)
        return x, embedding

CPU times: user 21 µs, sys: 0 ns, total: 21 µs
Wall time: 22.4 µs


In [6]:
# Initialize Model
# Update input_dim to include one-hot encoded permno_id length
input_dim = len(train_dataset.input_columns) + len(one_hot_encoder.categories_[0]) + 1
model = TransformerModel(
    input_dim=input_dim,
    embed_dim=256,
    n_heads=16,
    ff_dim=1024,
    n_layers=6,
    output_dim=len(train_dataset.target_columns),
    dropout=0.25,
    attn_dropout=0.1,
    residual_dropout=0.1,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training Loop
optimizer = torch.optim.RAdam(model.parameters(), lr=0.00005)
criterion = nn.L1Loss()

train_losses = []
val_losses = []

num_epochs = 10
patience = 5
best_val_loss = float("inf")
patience_counter = 0

print("Starting Training...")
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        output, _ = model(batch["features"].to(device))
        loss = criterion(output, batch["target"].to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_losses.append(running_loss / len(train_dataloader))

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            output, _ = model(batch["features"].to(device))
            loss = criterion(output, batch["target"].to(device))
            val_loss += loss.item()
    val_losses.append(val_loss / len(val_dataloader))

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_losses[-1]}, Validation Loss: {val_losses[-1]}")

    if val_losses[-1] < best_val_loss:
        best_val_loss = val_losses[-1]
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

print("Training Complete")

# Plot the training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label="Training Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.show()


NameError: name 'one_hot_encoder' is not defined

## Stock Embedding

In [33]:
# Extract embeddings for each stock
stock_embeddings = {}
model.eval()
with torch.no_grad():
    for batch in train_dataloader:
        _, embeddings = model(batch["features"].to(device))
        for i, permno_id in enumerate(batch["permno_id"]):
            if permno_id not in stock_embeddings:
                stock_embeddings[permno_id] = []
            stock_embeddings[permno_id].append(embeddings[i].cpu().numpy())

In [34]:
# Average the embeddings for each stock
for permno_id in stock_embeddings:
    stock_embeddings[permno_id] = np.mean(stock_embeddings[permno_id], axis=0)

In [35]:
list(stock_embeddings.items())[:2]

[('10104',
  array([ 8.4816766e-01, -1.5194374e+00,  5.9012413e-01, -5.6201398e-01,
          7.6335293e-01, -4.5656464e-01, -5.3003037e-01, -9.2574376e-01,
         -1.2043912e+00,  6.4523458e-01,  3.3464134e-01, -1.7061759e+00,
         -1.2175984e+00,  1.2132494e+00, -2.8131682e-02,  2.1662381e+00,
          3.7906647e-01,  1.0697567e+00,  1.1021727e+00, -7.3253524e-01,
          1.1026136e+00,  8.9342844e-01,  1.4283441e+00, -1.9291799e-01,
         -4.1078594e-01,  1.1794630e+00, -4.8431557e-01, -4.5568582e-01,
          1.1761404e+00, -2.2393908e-01,  1.6652354e+00, -6.9927013e-01,
          3.2872522e-01,  5.7171303e-01, -8.5946906e-01, -1.0537200e+00,
         -1.5642016e+00, -1.5888990e+00, -9.4908351e-01, -1.5516607e+00,
         -4.6480864e-02, -1.4805536e+00,  6.1790133e-01,  4.9487892e-01,
          4.1143429e-01,  1.0928814e+00, -1.6546495e+00,  8.0822176e-01,
          6.0433197e-01,  1.5548137e+00, -4.1606587e-01,  2.2853872e-01,
         -1.0394421e+00, -3.9808327e-01,

## Cosine Similarity Matrix

In [36]:
# Assuming stock_embeddings is a dictionary with permno_id as keys and embeddings as values
embeddings_list = []
permno_ids = []

for permno_id, embedding in stock_embeddings.items():
    permno_ids.append(permno_id)
    embeddings_list.append(embedding)

# Convert the list of embeddings to a numpy array
embeddings_array = np.array(embeddings_list)

# Transform embeddings using cosine similarity
cosine_sim_matrix = cosine_similarity(embeddings_array)
cosine_sim_matrix

array([[1.]], dtype=float32)

## Model Features Check

In [37]:
for batch in train_dataloader:
    print(batch)
    break

{'features': tensor([[[-1.3843e-02,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.2897e-02,  6.4040e-03,  4.8720e-03,  ...,  0.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 9.8028e-03,  4.0480e-03,  9.6000e-04,  ...,  0.0000e+00,
           0.0000e+00,  2.0000e+00],
         ...,
         [ 1.0467e-02,  2.1420e-03,  5.3540e-03,  ...,  0.0000e+00,
           0.0000e+00,  4.0000e+00],
         [ 5.4969e-03, -2.6340e-03, -2.7040e-03,  ...,  0.0000e+00,
           0.0000e+00,  5.0000e+00],
         [-5.3141e-03, -2.4190e-03, -1.9280e-03,  ...,  0.0000e+00,
           0.0000e+00,  6.0000e+00]],

        [[ 2.2897e-02,  6.4040e-03,  4.8720e-03,  ...,  0.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 9.8028e-03,  4.0480e-03,  9.6000e-04,  ...,  0.0000e+00,
           0.0000e+00,  2.0000e+00],
         [ 6.0010e-03,  9.5810e-03,  6.7220e-03,  ...,  0.0000e+00,
           0.0000e+00,  3.0000e+00],
         ...,
         [ 5

In [38]:
batch["features"].shape

torch.Size([32, 7, 560])

In [39]:
batch["target"].shape

torch.Size([32, 11])

In [40]:
len(batch["permno_id"])

32

In [41]:
batch["features"][:5]

tensor([[[-1.3843e-02,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.2897e-02,  6.4040e-03,  4.8720e-03,  ...,  0.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 9.8028e-03,  4.0480e-03,  9.6000e-04,  ...,  0.0000e+00,
           0.0000e+00,  2.0000e+00],
         ...,
         [ 1.0467e-02,  2.1420e-03,  5.3540e-03,  ...,  0.0000e+00,
           0.0000e+00,  4.0000e+00],
         [ 5.4969e-03, -2.6340e-03, -2.7040e-03,  ...,  0.0000e+00,
           0.0000e+00,  5.0000e+00],
         [-5.3141e-03, -2.4190e-03, -1.9280e-03,  ...,  0.0000e+00,
           0.0000e+00,  6.0000e+00]],

        [[ 2.2897e-02,  6.4040e-03,  4.8720e-03,  ...,  0.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 9.8028e-03,  4.0480e-03,  9.6000e-04,  ...,  0.0000e+00,
           0.0000e+00,  2.0000e+00],
         [ 6.0010e-03,  9.5810e-03,  6.7220e-03,  ...,  0.0000e+00,
           0.0000e+00,  3.0000e+00],
         ...,
         [ 5.4969e-03, -2

In [42]:
model(batch["features"].to(device))

(tensor([[-0.0027, -0.0035,  0.0628,  0.1169, -0.0021,  0.0405, -0.0047,  0.0124,
           0.1442,  0.1665,  0.2067],
         [ 0.0007,  0.0028,  0.0522,  0.1206,  0.0056,  0.0433, -0.0041,  0.0168,
           0.1546,  0.1654,  0.2011],
         [ 0.0033,  0.0052,  0.0473,  0.1214,  0.0084,  0.0451, -0.0036,  0.0181,
           0.1588,  0.1657,  0.1994],
         [ 0.0049,  0.0061,  0.0448,  0.1220,  0.0088,  0.0462, -0.0027,  0.0180,
           0.1605,  0.1665,  0.1992],
         [ 0.0060,  0.0066,  0.0432,  0.1225,  0.0090,  0.0472, -0.0020,  0.0179,
           0.1615,  0.1669,  0.1988],
         [ 0.0064,  0.0063,  0.0415,  0.1230,  0.0093,  0.0479, -0.0014,  0.0182,
           0.1622,  0.1675,  0.1980],
         [ 0.0066,  0.0059,  0.0405,  0.1232,  0.0094,  0.0482, -0.0012,  0.0185,
           0.1627,  0.1680,  0.1975],
         [ 0.0066,  0.0057,  0.0395,  0.1235,  0.0095,  0.0484, -0.0012,  0.0190,
           0.1630,  0.1685,  0.1970],
         [ 0.0065,  0.0056,  0.0387,  0.

In [43]:
train_dataset.sequences[-5:]

[(array([[-1.96385808e-02, -2.22870000e-02, -3.19420000e-02, ...,
           0.00000000e+00,  0.00000000e+00,  9.88000000e+02],
         [ 7.90169919e-03,  1.29550000e-02,  7.79200000e-03, ...,
           0.00000000e+00,  0.00000000e+00,  9.89000000e+02],
         [ 1.24027667e-02,  2.23980000e-02,  3.01470000e-02, ...,
           0.00000000e+00,  0.00000000e+00,  9.90000000e+02],
         ...,
         [-1.91282829e-03, -1.32400000e-02, -3.54850000e-02, ...,
           0.00000000e+00,  0.00000000e+00,  9.92000000e+02],
         [ 1.45081680e-01,  1.42930000e-02, -3.86700000e-03, ...,
           0.00000000e+00,  0.00000000e+00,  9.93000000e+02],
         [-1.70993644e-02, -7.50000000e-03, -6.12600000e-03, ...,
           0.00000000e+00,  0.00000000e+00,  9.94000000e+02]]),
  array([-0.00996145,  0.00996145, -0.42433243,  0.42433243, -0.13991876,
          0.13991876, -0.06378652,  0.06378652,  0.55133603,  0.40099837,
          0.31851795]),
  '10104'),
 (array([[ 7.90169919e-03,  1.29

In [44]:
np.array(stock_embeddings["10104"]).shape

(256,)