<a href="https://colab.research.google.com/github/UoB-DSMP-2023-24/dsmp-2024-group22/blob/main/TapesAnalysisTFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth
auth.authenticate_user()

In [None]:
from google.cloud import storage
gcs = storage.Client()

In [None]:
import pandas as pd
import io

bucket_name = 'jpm-tapes'
directory_path = 'Tapes/'

bucket = gcs.get_bucket(bucket_name)

In [None]:
blobs = bucket.list_blobs(prefix=directory_path)  # Lists all the blobs in the directory

all_data_frames = []  # List to store each processed DataFrame

for blob in blobs:
    if blob.name.endswith('.csv'):  # Check if the blob is a CSV file
        # Read the content of the file
        data = blob.download_as_bytes()
        data_io = io.BytesIO(data)
        df = pd.read_csv(data_io, names=['Time', 'Price', 'Quantity'])

        # Convert 'Time' to a datetime format (assuming it's in seconds from the start of the day)
        df['DateTime'] = pd.to_datetime(df['Time'], unit='s', origin=pd.Timestamp('2025-01-01'))
        df.set_index('DateTime', inplace=True)

        # Resample and calculate OHLCV
        ohlc = df['Price'].resample('1T').ohlc()
        ohlc['Volume'] = df['Quantity'].resample('1T').sum()

        # Reset the index to make DateTime a column
        ohlc.reset_index(inplace=True)

        # Append the processed DataFrame to the list
        all_data_frames.append(ohlc)

# Concatenate all DataFrames into a single DataFrame
final_data_frame = pd.concat(all_data_frames)

# Display the result
print(final_data_frame.head())
print()
print(final_data_frame.tail())
print()
print(len(final_data_frame))
# Renaming the column properly if necessary
final_data_frame.columns = ['DateTime', 'Open', 'High', 'Low', 'Close', 'Volume']

             DateTime  open  high  low  close  Volume
0 2025-01-01 00:00:00   267   270  252    261      68
1 2025-01-01 00:01:00   259   267  254    265     106
2 2025-01-01 00:02:00   261   269  250    266      89
3 2025-01-01 00:03:00   269   269  258    261      86
4 2025-01-01 00:04:00   261   270  258    269      78

               DateTime  open  high  low  close  Volume
505 2025-01-01 08:25:00   108   112  101    105     108
506 2025-01-01 08:26:00   108   112  104    107     112
507 2025-01-01 08:27:00   112   113   98     98      98
508 2025-01-01 08:28:00    98   112   98    109     100
509 2025-01-01 08:29:00   107   112  103    107     112

63750


In [None]:
!pip install pytorch-forecasting pytorch_lightning

Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.0.0-py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.4/140.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch_lightning
  Downloading pytorch_lightning-2.2.3-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.2/802.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.80 (from pytorch-forecasting)
  Downloading fastapi-0.110.2-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.2.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting optuna<4.0.0,>=3.1.0 (from pytorch-forecasting)
  Downloading optuna-3.

In [None]:
import pandas as pd
import torch
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data.encoders import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
import pytorch_lightning as pl  # Correct the import for pytorch_lightning
import numpy as np

from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

import lightning.pytorch as pl # Instead of import pytorch_lightning as pl

from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import LearningRateMonitor
from sklearn.preprocessing import MinMaxScaler

# Data Preparation
final_data_frame['DateTime'] = pd.to_datetime(final_data_frame['DateTime'])
final_data_frame = final_data_frame.drop_duplicates(subset=['DateTime']).reset_index(drop=True)
final_data_frame['time_idx'] = (final_data_frame['DateTime'] - final_data_frame['DateTime'].min()).dt.total_seconds().astype(int)
final_data_frame['time_idx'] //= 60  # Convert from seconds to minutes
final_data_frame['group_id'] = 'stock_price'  # Adding a constant group_id for simplicity
scaler = MinMaxScaler()
final_data_frame[['Open', 'High', 'Low', 'Close', 'Volume']] = scaler.fit_transform(final_data_frame[['Open', 'High', 'Low', 'Close', 'Volume']])

# TimeSeriesDataSet Setup
max_prediction_length = 5
max_encoder_length = 20
training_cutoff = final_data_frame['time_idx'].max() - max_prediction_length

training = TimeSeriesDataSet(
    final_data_frame[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="Close",
    group_ids=["group_id"],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=[],
    static_reals=[],
    time_varying_known_categoricals=[],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=["Open", "High", "Low", "Close", "Volume"],
    target_normalizer=GroupNormalizer(groups=["group_id"], transformation="log1p"),  # Changed transformation to log1p
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# DataLoader for Training and Validation
batch_size = 16
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=4)
val_dataloader = training.to_dataloader(train=False, batch_size=batch_size, num_workers=4)
# Number of data points in training DataLoader
train_data_points = len(train_dataloader.dataset)
print(f"Total training data points: {train_data_points}")

# Number of data points in validation DataLoader
val_data_points = len(val_dataloader.dataset)
print(f"Total validation data points: {val_data_points}")

# PyTorch Lightning Trainer Configuration
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=100)

# Temporal Fusion Transformer Model Initialization
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # Predicting 7 quantiles
    loss=QuantileLoss(),
    log_interval=10,
    reduce_on_plateau_patience=4,
)

# Training the Model
trainer.fit(tft, train_dataloader, val_dataloader)

# After prediction, ensure tensors are on the same device
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)]).cpu()
predictions = tft.predict(val_dataloader).cpu()
predictions = torch.nan_to_num(predictions, nan=0.0)  # Handling NaNs in predictions

# Now compute the mean absolute error
mae = (actuals - predictions).abs().mean()
print(mae)

import numpy as np
from sklearn.metrics import r2_score

# Compute MSE using PyTorch
mse = torch.mean((actuals - predictions) ** 2)
print("MSE:", mse.item())  # Using .item() to get the value as a Python float

# Compute MAE using PyTorch
mae = torch.mean(torch.abs(actuals - predictions))
print("MAE:", mae.item())  # Using .item() to get the value as a Python float

# Convert torch tensors to numpy arrays for sklearn compatibility to compute R-squared
actuals_np = actuals.numpy()
predictions_np = predictions.numpy()

# Compute R-squared
r_squared = r2_score(actuals_np, predictions_np)
print("R-squared:", r_squared)





Total training data points: 509
Total validation data points: 509


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (31) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=100` reached.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


tensor(0.0572)
MSE: 0.0054450491443276405
MAE: 0.05715535208582878
R-squared: 0.9056693804065246


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data.encoders import GroupNormalizer

# Assuming your future data is already loaded in future_data_frame
# Make sure to prepare this dataframe the same way you prepared your training data

# Setting up the dataset for prediction
future_dataset = TimeSeriesDataSet(
    future_data_frame,
    time_idx="time_idx",
    target="Close",
    group_ids=["group_id"],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=[],
    static_reals=[],
    time_varying_known_categoricals=[],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=["Open", "High", "Low", "Close", "Volume"],
    target_normalizer=GroupNormalizer(groups=["group_id"], transformation="log1p"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# Adjusting the number of DataLoader workers as per the system's recommendation
predict_dataloader = future_dataset.to_dataloader(train=False, batch_size=batch_size, num_workers=2)

# Predicting using the trained model
raw_predictions = tft.predict(predict_dataloader, mode="raw")

# If raw_predictions is a dictionary that contains the predictions and possibly other data, extract the predictions
# Assuming raw_predictions is structured correctly with dimensions [batch_size, n_timesteps, n_features]
predictions = raw_predictions.mean(dim=1)  # Example: Reduce along the quantile dimension if applicable

# Inverse transform the predictions if you've used any scaling/normalization
def inverse_transform(predictions, scaler):
    predictions = predictions.numpy()  # Convert predictions to numpy if they are in torch.Tensor
    return scaler.inverse_transform(predictions.reshape(-1, 5))  # Reshape if necessary and inverse transform

predicted_data = inverse_transform(predictions, scaler)

# Add the predicted data back to the future_data_frame for visualization
future_data_frame[['Pred_Open', 'Pred_High', 'Pred_Low', 'Pred_Close', 'Pred_Volume']] = predicted_data

# Plotting the results
plt.figure(figsize=(12, 6))
plt.plot(future_data_frame['DateTime'], future_data_frame['Close'], label='Actual Close')
plt.plot(future_data_frame['DateTime'], future_data_frame['Pred_Close'], label='Predicted Close', linestyle='--')
plt.title('Actual vs Predicted Close Prices')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  self.pid = os.fork()
  self.pid = os.fork()


AttributeError: 'Output' object has no attribute 'mean'