In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import yfinance as yf  # for fetching stock data
import matplotlib.pyplot as plt
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss, MAE

In [None]:
# Code for getting tickers of the Top 200 companies by market cap
# Load the S&P 500 company tickers from Wikipedia
# url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# sp500_table = pd.read_html(url)
# sp500_df = sp500_table[0]  # The first table on the page is usually the one we want
# tickers = sp500_df['Symbol'].tolist()  # Get the list of tickers

# # Function to get the market cap for each ticker
# def get_market_cap(ticker):
#     try:
#         stock = yf.Ticker(ticker)
#         market_cap = stock.info.get('marketCap')
#         return market_cap
#     except Exception as e:
#         print(f"Error fetching data for {ticker}: {e}")
#         return None

# # Create a list to store the tickers and their market caps
# ticker_data = []

# for ticker in tickers:
#     market_cap = get_market_cap(ticker)
#     if market_cap is not None:
#         ticker_data.append({'Ticker': ticker, 'MarketCap': market_cap})

# # Create a DataFrame and sort by market cap
# ticker_df = pd.DataFrame(ticker_data)
# ticker_df = ticker_df.sort_values(by='MarketCap', ascending=False)

# # Select the top 200 companies by market cap
# top_200 = ticker_df.head(200)
# print(top_200['Ticker'].tolist())  # Print the list of top 200 tickers


In [2]:
def convert_data_type(df, time_cols=[], float_cols=[], cat_cols=[]):
    for col in time_cols:
        df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
    for col in float_cols:
        df[col] = df[col].astype(float)
    for col in cat_cols:
        df[col] = df[col].astype('category')
    return df

# Define the stock tickers and the date range
tickers = ['AAPL', 'NVDA', 'MSFT', 'GOOG', 'GOOGL', 'AMZN', 'META', 'TSLA', 'AVGO', 'LLY', 'WMT', 'JPM', 'V', 'UNH', 'XOM', 'ORCL', 'MA', 'HD', 'PG', 'COST', 'JNJ', 'ABBV', 'NFLX', 'BAC', 'CRM', 'KO', 'CVX', 'TMUS', 'MRK', 'AMD', 'PEP', 'CSCO', 'LIN', 'ACN', 'WFC', 'TMO', 'ADBE', 'MCD', 'ABT', 'BX', 'PM', 'NOW', 'IBM', 'AXP', 'MS', 'TXN', 'GE', 'QCOM', 'CAT', 'ISRG', 'DHR', 'INTU', 'VZ', 'DIS', 'AMGN', 'CMCSA', 'GS', 'PFE', 'NEE', 'T', 'RTX', 'BKNG', 'UBER', 'AMAT', 'SPGI', 'LOW', 'BLK', 'PGR', 'UNP', 'SYK', 'HON', 'ETN', 'SCHW', 'LMT', 'KKR', 'TJX', 'COP', 'ANET', 'BSX', 'VRTX', 'C', 'PANW', 'ADP', 'NKE', 'BA', 'MDT', 'FI', 'UPS', 'SBUX', 'ADI', 'CB', 'GILD', 'MU', 'BMY', 'DE', 'MMC', 'PLD', 'INTC', 'AMT', 'SO', 'LRCX', 'ELV', 'PLTR', 'REGN', 'DELL', 'MDLZ', 'MO', 'HCA', 'SHW', 'KLAC', 'ICE', 'CI', 'DUK', 'ABNB', 'WM', 'EQIX', 'TT', 'GEV', 'WELL', 'CTAS', 'MCO', 'ZTS', 'APH', 'PH', 'CEG', 'GD', 'CME', 'CMG', 'SNPS', 'AON', 'ITW', 'PYPL', 'CDNS', 'CL', 'MSI', 'USB', 'CRWD', 'PNC', 'NOC', 'MAR', 'TDG', 'CVS', 'TGT', 'MMM', 'ECL', 'APD', 'EOG', 'MCK', 'BDX', 'FDX', 'ORLY', 'FCX', 'CARR', 'CSX', 'SPG', 'WMB', 'COF', 'RSG', 'EMR', 'ADSK', 'AJG', 'NXPI', 'FTNT', 'DLR', 'AFL', 'ROP', 'HLT', 'TFC', 'PSA', 'NSC', 'SLB', 'GM', 'TRV', 'BK', 'OKE', 'MET', 'RCL', 'DHI', 'PCAR', 'GWW', 'KMI', 'PCG', 'CHTR', 'URI', 'AEP', 'SRE', 'NEM', 'FANG', 'O', 'MNST', 'JCI', 'AZO', 'PAYX', 'CPRT', 'PSX', 'D', 'AMP', 'FICO', 'ALL', 'AIG']
start_date = "2018-06-01"
end_date = "2024-06-30"

In [3]:
# Download historical data from yfinance
df = yf.download(tickers, start=start_date, end=end_date, group_by='ticker')

[*********************100%***********************]  200 of 200 completed


In [7]:
df.head()

Ticker,AIG,AIG,AIG,AIG,AIG,AIG,KLAC,KLAC,KLAC,KLAC,...,BKNG,BKNG,BKNG,BKNG,TXN,TXN,TXN,TXN,TXN,TXN
Price,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-06-01 00:00:00+00:00,53.41,53.630001,53.130001,53.290001,44.729973,4449000,114.510002,116.419998,113.849998,115.839996,...,2118.050049,2128.939941,2113.727295,301500,112.550003,114.800003,112.32,114.800003,95.804886,4728400
2018-06-04 00:00:00+00:00,53.610001,54.169998,53.48,53.779999,45.141266,3634900,115.279999,117.120003,115.169998,116.239998,...,2125.26001,2125.800049,2110.609863,284700,114.739998,116.379997,114.309998,115.739998,96.589325,3905700
2018-06-05 00:00:00+00:00,53.630001,53.939999,53.34,53.84,45.191616,3947300,117.040001,118.190002,116.57,118.129997,...,2126.0,2140.98999,2125.691406,339900,116.690002,117.93,115.830002,117.660004,98.19165,5366600
2018-06-06 00:00:00+00:00,53.93,54.130001,53.25,53.939999,45.275555,5892500,118.139999,118.559998,116.900002,117.43,...,2129.030029,2153.780029,2138.389893,297700,117.57,118.120003,116.910004,118.110001,98.5672,4015400
2018-06-07 00:00:00+00:00,54.0,54.0,53.419998,53.869999,45.216805,5306500,115.449997,116.57,114.449997,115.93,...,2121.919922,2131.97998,2116.745361,293800,118.349998,118.480003,116.139999,117.239998,97.841133,6985100


In [9]:
# Convert the multi-index DataFrame to a single DataFrame
df = df.stack(level=0).reset_index()
df.head()

Price,Date,Ticker,Open,High,Low,Close,Adj Close,Volume
0,2018-06-01 00:00:00+00:00,AAPL,46.997501,47.564999,46.9375,47.560001,45.159534,93770000.0
1,2018-06-01 00:00:00+00:00,ABBV,97.93,99.43,97.900002,98.050003,73.350159,14837300.0
2,2018-06-01 00:00:00+00:00,ABT,61.959999,62.490002,61.849998,62.41,55.784615,3595800.0
3,2018-06-01 00:00:00+00:00,ACN,156.940002,159.279999,156.559998,158.889999,144.475052,1808800.0
4,2018-06-01 00:00:00+00:00,ADBE,250.550003,251.830002,248.850006,251.309998,251.309998,2773200.0


In [11]:
df.columns = ['time', 'ticker', 'open', 'high', 'low', 'close', 'adjclose', 'volume']
df = convert_data_type(df, time_cols=['time'], float_cols=['open', 'high', 'low', 'close', 'volume'], cat_cols=['ticker'])
df.head()

Unnamed: 0,time,ticker,open,high,low,close,adjclose,volume
0,2018-06-01 00:00:00+00:00,AAPL,46.997501,47.564999,46.9375,47.560001,45.159534,93770000.0
1,2018-06-01 00:00:00+00:00,ABBV,97.93,99.43,97.900002,98.050003,73.350159,14837300.0
2,2018-06-01 00:00:00+00:00,ABT,61.959999,62.490002,61.849998,62.41,55.784615,3595800.0
3,2018-06-01 00:00:00+00:00,ACN,156.940002,159.279999,156.559998,158.889999,144.475052,1808800.0
4,2018-06-01 00:00:00+00:00,ADBE,250.550003,251.830002,248.850006,251.309998,251.309998,2773200.0


In [13]:
# Set the time as index and sort values by time
df = df.set_index('time', drop=True).reset_index()
df = df.groupby('ticker').apply(lambda x: x.sort_values('time')).reset_index(drop=True)
df.head()

  df = df.groupby('ticker').apply(lambda x: x.sort_values('time')).reset_index(drop=True)


Unnamed: 0,time,ticker,open,high,low,close,adjclose,volume
0,2018-06-01 00:00:00+00:00,AAPL,46.997501,47.564999,46.9375,47.560001,45.159534,93770000.0
1,2018-06-04 00:00:00+00:00,AAPL,47.91,48.355,47.837502,47.9575,45.536972,105064800.0
2,2018-06-05 00:00:00+00:00,AAPL,48.267502,48.485001,48.09,48.327499,45.888302,86264000.0
3,2018-06-06 00:00:00+00:00,AAPL,48.407501,48.52,47.98,48.494999,46.04734,83734400.0
4,2018-06-07 00:00:00+00:00,AAPL,48.535,48.549999,48.084999,48.365002,45.923916,85388800.0


In [15]:
# Create 'time_idx' column
df['time_idx'] = df.groupby('ticker').cumcount()
max_prediction_length = 4
max_encoder_length = 24
training_cutoff = df["time_idx"].max() - max_prediction_length * 2
df.head()

  df['time_idx'] = df.groupby('ticker').cumcount()


Unnamed: 0,time,ticker,open,high,low,close,adjclose,volume,time_idx
0,2018-06-01 00:00:00+00:00,AAPL,46.997501,47.564999,46.9375,47.560001,45.159534,93770000.0,0
1,2018-06-04 00:00:00+00:00,AAPL,47.91,48.355,47.837502,47.9575,45.536972,105064800.0,1
2,2018-06-05 00:00:00+00:00,AAPL,48.267502,48.485001,48.09,48.327499,45.888302,86264000.0,2
3,2018-06-06 00:00:00+00:00,AAPL,48.407501,48.52,47.98,48.494999,46.04734,83734400.0,3
4,2018-06-07 00:00:00+00:00,AAPL,48.535,48.549999,48.084999,48.365002,45.923916,85388800.0,4


In [17]:
# Create training and validation datasets
training = TimeSeriesDataSet(
    df[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="close",
    group_ids=["ticker"],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["ticker"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_reals=["open", "high", "low", "close", "volume"],
    target_normalizer=GroupNormalizer(groups=["ticker"], transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=False,
    randomize_length=True,
)

# Create validation and test datasets
validation = TimeSeriesDataSet.from_dataset(training, df, predict=True, stop_randomization=True)
test = TimeSeriesDataSet.from_dataset(training, df, predict=True, stop_randomization=True, min_encoder_length=max_encoder_length)


In [18]:
# Create dataloaders for model
batch_size = 32
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)
test_dataloader = test.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)
# Configure network and trainer
pl.seed_everything(42)

Seed set to 42


42

In [19]:
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=32,
    attention_head_size=5,
    dropout=0.1,
    hidden_continuous_size=32,
    loss=QuantileLoss(),
    optimizer="adam"
)

C:\Users\selvam\anaconda3\envs\dl\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
C:\Users\selvam\anaconda3\envs\dl\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.


In [23]:
# Configure callbacks
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
logger = TensorBoardLogger("lightning_logs")

In [25]:
# Train the model
trainer = pl.Trainer(
    max_epochs=50,
    accelerator="cpu",
    enable_model_summary=True,
    gradient_clip_val=0.1,
    limit_train_batches=50,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [27]:

trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)


   | Name                               | Type                            | Params | Mode 
------------------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0      | train
1  | logging_metrics                    | ModuleList                      | 0      | train
2  | input_embeddings                   | MultiEmbedding                  | 6.2 K  | train
3  | prescalers                         | ModuleDict                      | 640    | train
4  | static_variable_selection          | VariableSelectionNetwork        | 13.7 K | train
5  | encoder_variable_selection         | VariableSelectionNetwork        | 32.5 K | train
6  | decoder_variable_selection         | VariableSelectionNetwork        | 8.9 K  | train
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K  | train
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K  

Sanity Checking: |                                                                               | 0/? [00:00<…

C:\Users\selvam\anaconda3\envs\dl\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
C:\Users\selvam\anaconda3\envs\dl\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |                                                                                      | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

Validation: |                                                                                    | 0/? [00:00<…

In [29]:
# Calculate accuracy on the test set (using MAE)
predictions = tft.predict(test_dataloader, return_y=True)  # returns an object containing both predictions and actuals
preds, actuals = predictions.output, predictions.y

# Calculate MAE
mae = MAE()(preds, actuals)
print(f"Test MAE: {mae}")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
C:\Users\selvam\anaconda3\envs\dl\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Test MAE: 29.96747398376465
