## Import Package

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Load Data

In [3]:
df = pd.read_csv('/content/gdrive/My Drive/Capstone/d.Shuo/filtered_data.csv')

In [4]:
df

Unnamed: 0,PERMNO,date,CUSIP,COMNAM,TICKER,NAICS,PRIMEXCH,RET,TRDSTAT
0,10026,2023-01-03,46603210,J & J SNACK FOODS CORP,JJSF,311930.0,Q,0.011823,A
1,10026,2023-01-04,46603210,J & J SNACK FOODS CORP,JJSF,311930.0,Q,-0.001716,A
2,10026,2023-01-05,46603210,J & J SNACK FOODS CORP,JJSF,311930.0,Q,-0.010713,A
3,10026,2023-01-06,46603210,J & J SNACK FOODS CORP,JJSF,311930.0,Q,0.020321,A
4,10026,2023-01-09,46603210,J & J SNACK FOODS CORP,JJSF,311930.0,Q,-0.019851,A
...,...,...,...,...,...,...,...,...,...
2197350,93436,2023-12-22,88160R10,TESLA INC,TSLA,336110.0,Q,-0.007701,A
2197351,93436,2023-12-26,88160R10,TESLA INC,TSLA,336110.0,Q,0.016116,A
2197352,93436,2023-12-27,88160R10,TESLA INC,TSLA,336110.0,Q,0.018822,A
2197353,93436,2023-12-28,88160R10,TESLA INC,TSLA,336110.0,Q,-0.031594,A


In [5]:
df.dtypes

PERMNO        int64
date         object
CUSIP        object
COMNAM       object
TICKER       object
NAICS       float64
PRIMEXCH     object
RET         float64
TRDSTAT      object
dtype: object

## Feature Engineering

In [6]:
data_1 = df[['date', 'TICKER', 'RET']]
data_1

Unnamed: 0,date,TICKER,RET
0,2023-01-03,JJSF,0.011823
1,2023-01-04,JJSF,-0.001716
2,2023-01-05,JJSF,-0.010713
3,2023-01-06,JJSF,0.020321
4,2023-01-09,JJSF,-0.019851
...,...,...,...
2197350,2023-12-22,TSLA,-0.007701
2197351,2023-12-26,TSLA,0.016116
2197352,2023-12-27,TSLA,0.018822
2197353,2023-12-28,TSLA,-0.031594


In [7]:
# Preprocess Data
data_1['log_return'] = np.log(data_1['RET'] + 1)
data_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1['log_return'] = np.log(data_1['RET'] + 1)


Unnamed: 0,date,TICKER,RET,log_return
0,2023-01-03,JJSF,0.011823,0.011754
1,2023-01-04,JJSF,-0.001716,-0.001717
2,2023-01-05,JJSF,-0.010713,-0.010771
3,2023-01-06,JJSF,0.020321,0.020117
4,2023-01-09,JJSF,-0.019851,-0.020051
...,...,...,...,...
2197350,2023-12-22,TSLA,-0.007701,-0.007731
2197351,2023-12-26,TSLA,0.016116,0.015988
2197352,2023-12-27,TSLA,0.018822,0.018647
2197353,2023-12-28,TSLA,-0.031594,-0.032104


In [8]:
data_2 = data_1[['date', 'TICKER', 'log_return']]
data_2

Unnamed: 0,date,TICKER,log_return
0,2023-01-03,JJSF,0.011754
1,2023-01-04,JJSF,-0.001717
2,2023-01-05,JJSF,-0.010771
3,2023-01-06,JJSF,0.020117
4,2023-01-09,JJSF,-0.020051
...,...,...,...
2197350,2023-12-22,TSLA,-0.007731
2197351,2023-12-26,TSLA,0.015988
2197352,2023-12-27,TSLA,0.018647
2197353,2023-12-28,TSLA,-0.032104


In [9]:
# Data Split - Train, Valid, and Test

In [10]:
# data = data_2
# data

In [11]:
# # Split the data into training, validation, and test sets
# data_len = len(data)
# train_size = int(0.7 * data_len)
# valid_size = int(0.15 * data_len)
# test_size = data_len - train_size - valid_size

# train_data = data.iloc[:train_size]
# valid_data = data.iloc[train_size:train_size + valid_size]
# test_data = data.iloc[train_size + valid_size:]

In [12]:
# train_data

In [13]:
# valid_data

In [14]:
# test_data

## Model

In [15]:
data = data_2
data

Unnamed: 0,date,TICKER,log_return
0,2023-01-03,JJSF,0.011754
1,2023-01-04,JJSF,-0.001717
2,2023-01-05,JJSF,-0.010771
3,2023-01-06,JJSF,0.020117
4,2023-01-09,JJSF,-0.020051
...,...,...,...
2197350,2023-12-22,TSLA,-0.007731
2197351,2023-12-26,TSLA,0.015988
2197352,2023-12-27,TSLA,0.018647
2197353,2023-12-28,TSLA,-0.032104


In [16]:
data.groupby('TICKER')['date'].describe()

Unnamed: 0_level_0,count,unique,top,freq
TICKER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,250,250,2023-01-03,1
AA,250,250,2023-01-03,1
AAA,250,250,2023-01-03,1
AAAU,250,250,2023-01-03,1
AACG,250,250,2023-01-03,1
...,...,...,...,...
ZVRA,211,211,2023-03-01,1
ZVSA,250,250,2023-01-03,1
ZWS,250,250,2023-01-03,1
ZYME,250,250,2023-01-03,1


In [17]:
data[data['TICKER']== 'ZVRA']

Unnamed: 0,date,TICKER,log_return
304303,2023-03-01,ZVRA,-0.062132
304304,2023-03-02,ZVRA,-0.024098
304305,2023-03-03,ZVRA,0.011194
304306,2023-03-06,ZVRA,-0.007449
304307,2023-03-07,ZVRA,-0.216159
...,...,...,...
304509,2023-12-22,ZVRA,0.014185
304510,2023-12-26,ZVRA,-0.004032
304511,2023-12-27,ZVRA,0.101718
304512,2023-12-28,ZVRA,0.039361


In [18]:
first_month_data = data_2[data_2['date'] < '2023-02-01']
first_month_data.head(30)

Unnamed: 0,date,TICKER,log_return
0,2023-01-03,JJSF,0.011754
1,2023-01-04,JJSF,-0.001717
2,2023-01-05,JJSF,-0.010771
3,2023-01-06,JJSF,0.020117
4,2023-01-09,JJSF,-0.020051
5,2023-01-10,JJSF,0.001736
6,2023-01-11,JJSF,0.016675
7,2023-01-12,JJSF,0.000262
8,2023-01-13,JJSF,0.013424
9,2023-01-17,JJSF,-0.00175


### Check and make sure no missing values

In [19]:
first_month_data.dtypes

date           object
TICKER         object
log_return    float64
dtype: object

In [20]:
first_month_data[first_month_data['log_return'].isnull() == 1]

Unnamed: 0,date,TICKER,log_return


In [21]:
first_month_data[first_month_data['date'].isnull() == 1]

Unnamed: 0,date,TICKER,log_return


In [22]:
first_month_data.isnull().sum()

date          0
TICKER        0
log_return    0
dtype: int64

In [23]:
len(first_month_data)

171105

### Build Model & Get Embedding

#### Model-1
1. The current model does not use positional encodings.
2. Need to add ordinal position for each return.
3. Need to have 1 modified transformer encoder layer without residual connections + (n-1) standard layers
4. Need to connect the output of the first transformer block to the embedding layer of the fi- nal encoder block

In [24]:
first_month_data[:100].head(30)

Unnamed: 0,date,TICKER,log_return
0,2023-01-03,JJSF,0.011754
1,2023-01-04,JJSF,-0.001717
2,2023-01-05,JJSF,-0.010771
3,2023-01-06,JJSF,0.020117
4,2023-01-09,JJSF,-0.020051
5,2023-01-10,JJSF,0.001736
6,2023-01-11,JJSF,0.016675
7,2023-01-12,JJSF,0.000262
8,2023-01-13,JJSF,0.013424
9,2023-01-17,JJSF,-0.00175


In [25]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

class FinancialDataset(Dataset):
    def __init__(self, data, window_size=7):
        self.data = data
        self.window_size = window_size
        self.data['log_return'] = pd.to_numeric(self.data['log_return'], errors='coerce')
        self.data = self.data.dropna()

        # Group by ticker
        self.groups = self.data.groupby('TICKER')
        self.sequences = []

        for name, group in self.groups:
            if len(group) > self.window_size:
                for i in range(len(group) - self.window_size):
                    X = group['log_return'].iloc[i:i+self.window_size].values
                    y = group['log_return'].iloc[i+self.window_size]
                    self.sequences.append((X, y, name))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        X, y, ticker = self.sequences[idx]
        return {'features': torch.tensor(X, dtype=torch.float), 'target': torch.tensor(y, dtype=torch.float), 'ticker': ticker}



class TransformerModel(nn.Module):
    def __init__(self, input_dim, embed_dim, n_heads, ff_dim, n_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=n_heads, dim_feedforward=ff_dim)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, x):
        x = self.embedding(x)  # Apply embedding layer
        x = x.unsqueeze(1)  # Add a dimension for the transformer (batch_size, seq_len, embed_dim)
        x = self.transformer(x) # Transformer expects input of shape (seq_len, batch_size, embed_dim)
        # return self.fc(x[-1])
        embedding = x[:, -1, :] # Get the last output token
        x = self.fc(x[:, -1, :])  # Fully connected layer on the last output token
        return x.squeeze(-1), embedding  # Return the final output and the embeddings

# Prepare Data
dataset = FinancialDataset(first_month_data[:100])
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

# Initialize Model
model = TransformerModel(input_dim=7, embed_dim=64, n_heads=8, ff_dim=256, n_layers=4)

# Training Loop
optimizer = torch.optim.RAdam(model.parameters(), lr=0.001)
# criterion = nn.MSELoss()
criterion = nn.L1Loss()

for epoch in range(1):
    for batch in dataloader:
        optimizer.zero_grad()
        output = model(batch['features'])
        loss = criterion(output[0].squeeze(), batch['target'])
        loss.backward()
        optimizer.step()

print("Training Complete")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['log_return'] = pd.to_numeric(self.data['log_return'], errors='coerce')


Training Complete


  return F.l1_loss(input, target, reduction=self.reduction)


In [26]:
# Extract embeddings for each stock
stock_embeddings = {}
model.eval()
with torch.no_grad():
    for batch in dataloader:
        _, embeddings = model(batch['features'])
        for i, ticker in enumerate(batch['ticker']):
            if ticker not in stock_embeddings:
                stock_embeddings[ticker] = []
            stock_embeddings[ticker].append(embeddings[i].cpu().numpy())

In [27]:
# Average the embeddings for each stock
for ticker in stock_embeddings:
    stock_embeddings[ticker] = np.mean(stock_embeddings[ticker], axis=0)

In [28]:
stock_embeddings

{'ADX': array([-0.09489898, -0.4284712 ,  0.75870585,  0.48658612, -1.2541337 ,
        -1.6493136 ,  3.0062573 ,  0.48599756,  0.7541896 , -0.5966588 ,
         1.0931239 , -1.8179427 , -1.1939106 , -0.01696481,  1.5204116 ,
         0.47683018, -0.27520698,  0.64082474,  0.345736  , -0.40880162,
         2.3979683 ,  0.5597917 , -0.46798643, -0.88326114,  1.5169983 ,
         0.93017304,  0.9201739 , -0.93523383,  1.0611819 ,  0.1688175 ,
         0.1674164 , -0.6706301 , -0.44997597, -0.81493133, -0.7303518 ,
         1.955812  ,  0.5042389 , -0.3660622 ,  0.42791355,  0.09001343,
        -0.4572457 ,  0.27014428,  0.79531723, -0.721762  ,  0.819721  ,
         0.02132408, -1.0475535 ,  0.25727007, -1.7370775 ,  0.11806122,
        -0.47296003, -1.3714905 ,  0.49984518, -1.3548615 , -0.2847164 ,
        -0.12699822,  1.9431201 , -1.1936442 , -1.3570058 , -0.12548524,
        -0.31846634, -0.33909383, -1.367096  ,  0.3306331 ], dtype=float32),
 'ELA': array([-0.11883219, -0.41992334,

In [29]:
for batch in dataloader:
    print(batch)
    break

{'features': tensor([[-0.0021,  0.0055, -0.0159,  0.0227, -0.0027,  0.0075,  0.0094],
        [ 0.0055, -0.0159,  0.0227, -0.0027,  0.0075,  0.0094,  0.0087],
        [-0.0159,  0.0227, -0.0027,  0.0075,  0.0094,  0.0087,  0.0033],
        [ 0.0227, -0.0027,  0.0075,  0.0094,  0.0087,  0.0033,  0.0013],
        [-0.0027,  0.0075,  0.0094,  0.0087,  0.0033,  0.0013, -0.0160],
        [ 0.0075,  0.0094,  0.0087,  0.0033,  0.0013, -0.0160, -0.0074],
        [ 0.0094,  0.0087,  0.0033,  0.0013, -0.0160, -0.0074,  0.0208],
        [ 0.0087,  0.0033,  0.0013, -0.0160, -0.0074,  0.0208,  0.0086],
        [ 0.0033,  0.0013, -0.0160, -0.0074,  0.0208,  0.0086, -0.0020],
        [ 0.0013, -0.0160, -0.0074,  0.0208,  0.0086, -0.0020, -0.0007],
        [-0.0160, -0.0074,  0.0208,  0.0086, -0.0020, -0.0007,  0.0151],
        [-0.0074,  0.0208,  0.0086, -0.0020, -0.0007,  0.0151,  0.0058],
        [ 0.0208,  0.0086, -0.0020, -0.0007,  0.0151,  0.0058, -0.0078],
        [ 0.0000, -0.0076,  0.0000,  0

In [30]:
model(batch['features'])

(tensor([0.1966, 0.2225, 0.2024, 0.2184, 0.2178, 0.2229, 0.2124, 0.1926, 0.1876,
         0.2085, 0.2261, 0.2071, 0.2060, 0.1978, 0.2346, 0.2656, 0.2104, 0.2050,
         0.2188, 0.2522, 0.1583, 0.2102, 0.2074, 0.2704, 0.2203, 0.2360, 0.2026,
         0.2096, 0.1892, 0.2105, 0.2213, 0.2270, 0.2209, 0.1981, 0.1641, 0.1731,
         0.1831, 0.2260, 0.2450, 0.2221, 0.2454, 0.2246, 0.2068, 0.2097, 0.2233,
         0.2268, 0.1915, 0.1936, 0.2018, 0.2276, 0.3004, 0.2007, 0.2065, 0.2090,
         0.1898, 0.1844, 0.1490, 0.1964, 0.2201, 0.2615, 0.1942, 0.1954, 0.2110,
         0.2244], grad_fn=<SqueezeBackward1>),
 tensor([[-0.0742, -0.4230,  0.7726,  ..., -0.3206, -1.3721,  0.3392],
         [-0.1128, -0.4251,  0.7389,  ..., -0.3437, -1.3393,  0.3263],
         [-0.0850, -0.4268,  0.7741,  ..., -0.3422, -1.3733,  0.3351],
         ...,
         [-0.0900, -0.4778,  0.7463,  ..., -0.3244, -1.3631,  0.2959],
         [-0.0933, -0.4073,  0.7650,  ..., -0.3174, -1.3647,  0.3349],
         [-0.1009

In [31]:
dataset.sequences

[(array([-0.00206513,  0.00549786, -0.01588957,  0.02271994, -0.00272671,
          0.00748095,  0.0094403 ]),
  0.008687157147454595,
  'ADX'),
 (array([ 0.00549786, -0.01588957,  0.02271994, -0.00272671,  0.00748095,
          0.0094403 ,  0.00868716]),
  0.0033214777803933435,
  'ADX'),
 (array([-0.01588957,  0.02271994, -0.00272671,  0.00748095,  0.0094403 ,
          0.00868716,  0.00332148]),
  0.0013251216383858657,
  'ADX'),
 (array([ 0.02271994, -0.00272671,  0.00748095,  0.0094403 ,  0.00868716,
          0.00332148,  0.00132512]),
  -0.01602166415441571,
  'ADX'),
 (array([-0.00272671,  0.00748095,  0.0094403 ,  0.00868716,  0.00332148,
          0.00132512, -0.01602166]),
  -0.007429530741163067,
  'ADX'),
 (array([ 0.00748095,  0.0094403 ,  0.00868716,  0.00332148,  0.00132512,
         -0.01602166, -0.00742953]),
  0.020799189386714323,
  'ADX'),
 (array([ 0.0094403 ,  0.00868716,  0.00332148,  0.00132512, -0.01602166,
         -0.00742953,  0.02079919]),
  0.008594957303

#### Model-2
add ordinal position for each return

In [32]:
data

Unnamed: 0,date,TICKER,log_return
0,2023-01-03,JJSF,0.011754
1,2023-01-04,JJSF,-0.001717
2,2023-01-05,JJSF,-0.010771
3,2023-01-06,JJSF,0.020117
4,2023-01-09,JJSF,-0.020051
...,...,...,...
2197350,2023-12-22,TSLA,-0.007731
2197351,2023-12-26,TSLA,0.015988
2197352,2023-12-27,TSLA,0.018647
2197353,2023-12-28,TSLA,-0.032104


In [62]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

class FinancialDataset(Dataset):
    def __init__(self, data, window_size=7):
        self.data = data
        self.window_size = window_size
        self.data['log_return'] = pd.to_numeric(self.data['log_return'], errors='coerce')
        self.data = self.data.dropna()

        # Group by ticker
        self.groups = self.data.groupby('TICKER')
        self.sequences = []

        for name, group in self.groups:
            j = self.window_size
            if len(group) > self.window_size:
                for i in range(len(group) - self.window_size):
                    X = group['log_return'].iloc[i:i+self.window_size].values
                    X = np.append(X, j)
                    j += 1

                    # print(X)
                    # t = np.arange(1, self.window_size + 1)  # Relative positions
                    # X = np.column_stack((X, t))  # Concatenate returns with positions

                    y = group['log_return'].iloc[i+self.window_size]
                    self.sequences.append((X, y, name))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        X, y, ticker = self.sequences[idx]
        return {'features': torch.tensor(X, dtype=torch.float), 'target': torch.tensor(y, dtype=torch.float), 'ticker': ticker}



class TransformerModel(nn.Module):
    def __init__(self, input_dim, embed_dim, n_heads, ff_dim, n_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=n_heads, dim_feedforward=ff_dim)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, x):
        x = self.embedding(x)  # Apply embedding layer
        x = x.unsqueeze(1)  # Add a dimension for the transformer (batch_size, seq_len, embed_dim)
        x = self.transformer(x)  # Transformer expects input of shape (seq_len, batch_size, embed_dim)
        embedding = x[:, -1, :]  # Get the last output token
        x = self.fc(embedding)  # Fully connected layer on the last output token
        return x.squeeze(-1), embedding  # Return the final output and the embeddings


# Prepare Data
dataset = FinancialDataset(first_month_data[:100])
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

# Initialize Model
model = TransformerModel(input_dim=8, embed_dim=64, n_heads=8, ff_dim=256, n_layers=4)


# Training Loop
optimizer = torch.optim.RAdam(model.parameters(), lr=0.001)
criterion = nn.L1Loss()

for epoch in range(1):
    for batch in dataloader:
        optimizer.zero_grad()
        output, _ = model(batch['features'])
        loss = criterion(output, batch['target'])
        # print(f'loss: {loss}')
        loss.backward()
        optimizer.step()

print("Training Complete")


Training Complete


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['log_return'] = pd.to_numeric(self.data['log_return'], errors='coerce')


In [55]:
batch['features']

tensor([[-1.8697e-03,  1.8183e-02,  0.0000e+00,  9.0059e-04, -8.1340e-03,
         -5.4599e-03,  3.6434e-03,  1.9000e+01]])

In [56]:
for batch in dataloader:
    print(batch)
    break

{'features': tensor([[-2.0651e-03,  5.4979e-03, -1.5890e-02,  2.2720e-02, -2.7267e-03,
          7.4809e-03,  9.4403e-03,  7.0000e+00],
        [ 5.4979e-03, -1.5890e-02,  2.2720e-02, -2.7267e-03,  7.4809e-03,
          9.4403e-03,  8.6872e-03,  8.0000e+00],
        [-1.5890e-02,  2.2720e-02, -2.7267e-03,  7.4809e-03,  9.4403e-03,
          8.6872e-03,  3.3215e-03,  9.0000e+00],
        [ 2.2720e-02, -2.7267e-03,  7.4809e-03,  9.4403e-03,  8.6872e-03,
          3.3215e-03,  1.3251e-03,  1.0000e+01],
        [-2.7267e-03,  7.4809e-03,  9.4403e-03,  8.6872e-03,  3.3215e-03,
          1.3251e-03, -1.6022e-02,  1.1000e+01],
        [ 7.4809e-03,  9.4403e-03,  8.6872e-03,  3.3215e-03,  1.3251e-03,
         -1.6022e-02, -7.4295e-03,  1.2000e+01],
        [ 9.4403e-03,  8.6872e-03,  3.3215e-03,  1.3251e-03, -1.6022e-02,
         -7.4295e-03,  2.0799e-02,  1.3000e+01],
        [ 8.6872e-03,  3.3215e-03,  1.3251e-03, -1.6022e-02, -7.4295e-03,
          2.0799e-02,  8.5950e-03,  1.4000e+01],
   

In [57]:
# Extract embeddings for each stock
stock_embeddings = {}
model.eval()
with torch.no_grad():
    for batch in dataloader:
        _, embeddings = model(batch['features'])
        for i, ticker in enumerate(batch['ticker']):
            if ticker not in stock_embeddings:
                stock_embeddings[ticker] = []
            stock_embeddings[ticker].append(embeddings[i].cpu().numpy())

In [58]:
np.array(stock_embeddings['ADX']).shape

(13, 64)

In [59]:
first_month_data[first_month_data['TICKER']=='ADX']

Unnamed: 0,date,TICKER,log_return
1000,2023-01-03,ADX,-0.002065
1001,2023-01-04,ADX,0.005498
1002,2023-01-05,ADX,-0.01589
1003,2023-01-06,ADX,0.02272
1004,2023-01-09,ADX,-0.002727
1005,2023-01-10,ADX,0.007481
1006,2023-01-11,ADX,0.00944
1007,2023-01-12,ADX,0.008687
1008,2023-01-13,ADX,0.003321
1009,2023-01-17,ADX,0.001325


In [60]:
# Average the embeddings for each stock
for ticker in stock_embeddings:
    stock_embeddings[ticker] = np.mean(stock_embeddings[ticker], axis=0)

In [61]:
stock_embeddings

{'ADX': array([ 0.56134075, -0.2788757 ,  0.60705185, -1.5672071 , -1.3693109 ,
        -1.421627  ,  0.29519475,  0.3477082 , -0.01353939, -0.85386175,
         1.1046065 ,  1.2438439 ,  2.0971365 ,  0.3012602 ,  0.41584155,
         0.03679939, -3.0424128 ,  0.57923114,  0.28293073,  0.10934129,
        -0.01833557,  0.6359041 , -0.8206408 ,  0.16360642, -0.3107761 ,
         1.2215368 ,  1.4740071 , -0.72096825, -0.01733635, -0.11769906,
         1.0622628 , -1.9786102 , -0.8230746 ,  0.6387139 ,  0.9444701 ,
         1.0522118 ,  0.41051385,  1.3697842 ,  1.6982778 ,  0.5211303 ,
        -1.9941661 , -2.2486353 ,  0.46832561,  0.3291526 , -0.37360987,
        -0.6467847 , -0.88605416, -1.3752028 ,  0.32203498,  0.11132046,
        -0.39993426,  0.2860389 ,  1.1876905 ,  0.9490039 ,  0.3063851 ,
         0.72065663,  0.04091819, -0.7049779 ,  0.17551377,  0.14730503,
        -1.2451148 ,  0.689332  , -0.9962528 , -0.68159527], dtype=float32),
 'ELA': array([ 0.5608593 , -0.28061265,

## Model-3

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

class FinancialDataset(Dataset):
    def __init__(self, data, window_size=7):
        self.data = data
        self.window_size = window_size
        self.data['log_return'] = pd.to_numeric(self.data['log_return'], errors='coerce')
        self.data = self.data.dropna()

        # Group by ticker
        self.groups = self.data.groupby('TICKER')
        self.sequences = []

        for name, group in self.groups:
            j = self.window_size
            if len(group) > self.window_size:
                for i in range(len(group) - self.window_size):
                    X = group['log_return'].iloc[i:i+self.window_size].values
                    X = np.append(X, j)
                    j += 1

                    # print(X)
                    # t = np.arange(1, self.window_size + 1)  # Relative positions
                    # X = np.column_stack((X, t))  # Concatenate returns with positions

                    y = group['log_return'].iloc[i+self.window_size]
                    self.sequences.append((X, y, name))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        X, y, ticker = self.sequences[idx]
        return {'features': torch.tensor(X, dtype=torch.float), 'target': torch.tensor(y, dtype=torch.float), 'ticker': ticker}



class TransformerModel(nn.Module):
    def __init__(self, input_dim, embed_dim, n_heads, ff_dim, n_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=n_heads, dim_feedforward=ff_dim)
        self.transformer1 = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.transformer2 = nn.TransformerEncoder(encoder_layer, num_layers=n_layers-1)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, x):
        x = self.embedding(x)  # Apply embedding layer
        x = x.unsqueeze(1)  # Add a dimension for the transformer (batch_size, seq_len, embed_dim)
        x1 = self.transformer1(x)  # Transformer expects input of shape (seq_len, batch_size, embed_dim)
        x2 = self.transformer2(x1)
        print(x2)
        embedding = x2[:, -1, :]  # Get the last output token
        x2 = self.fc(embedding)  # Fully connected layer on the last output token
        return x2.squeeze(-1), embedding  # Return the final output and the embeddings


# Prepare Data
dataset = FinancialDataset(first_month_data[:100])
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

# Initialize Model
model = TransformerModel(input_dim=8, embed_dim=64, n_heads=8, ff_dim=256, n_layers=4)


# Training Loop
optimizer = torch.optim.RAdam(model.parameters(), lr=0.001)
criterion = nn.L1Loss()

for epoch in range(1):
    for batch in dataloader:
        optimizer.zero_grad()
        output, _ = model(batch['features'])
        loss = criterion(output, batch['target'])
        # print(f'loss: {loss}')
        loss.backward()
        optimizer.step()

print("Training Complete")
