<a href="https://colab.research.google.com/github/UoB-DSMP-2023-24/dsmp-2024-group22/blob/main/TapesAnalysisTFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
from google.colab import auth
auth.authenticate_user()

In [11]:
from google.cloud import storage
gcs = storage.Client()

In [12]:
import pandas as pd
import io

bucket_name = 'jpm-tapes'
directory_path = 'Tapes/'

bucket = gcs.get_bucket(bucket_name)

In [13]:
blobs = bucket.list_blobs(prefix=directory_path)  # Lists all the blobs in the directory

all_data_frames = []  # List to store each processed DataFrame

for blob in blobs:
    if blob.name.endswith('.csv'):  # Check if the blob is a CSV file
        # Read the content of the file
        data = blob.download_as_bytes()
        data_io = io.BytesIO(data)
        df = pd.read_csv(data_io, names=['Time', 'Price', 'Quantity'])

        # Convert 'Time' to a datetime format (assuming it's in seconds from the start of the day)
        df['DateTime'] = pd.to_datetime(df['Time'], unit='s', origin=pd.Timestamp('2025-01-01'))
        df.set_index('DateTime', inplace=True)

        # Resample and calculate OHLCV
        ohlc = df['Price'].resample('1T').ohlc()
        ohlc['Volume'] = df['Quantity'].resample('1T').sum()

        # Reset the index to make DateTime a column
        ohlc.reset_index(inplace=True)

        # Append the processed DataFrame to the list
        all_data_frames.append(ohlc)

# Concatenate all DataFrames into a single DataFrame
final_data_frame = pd.concat(all_data_frames)

# Display the result
print(final_data_frame.head())
print()
print(final_data_frame.tail())
print()
print(len(final_data_frame))
# Renaming the column properly if necessary
final_data_frame.columns = ['DateTime', 'Open', 'High', 'Low', 'Close', 'Volume']

             DateTime  open  high  low  close  Volume
0 2025-01-01 00:00:00   267   270  252    261      68
1 2025-01-01 00:01:00   259   267  254    265     106
2 2025-01-01 00:02:00   261   269  250    266      89
3 2025-01-01 00:03:00   269   269  258    261      86
4 2025-01-01 00:04:00   261   270  258    269      78

               DateTime  open  high  low  close  Volume
505 2025-01-01 08:25:00   108   112  101    105     108
506 2025-01-01 08:26:00   108   112  104    107     112
507 2025-01-01 08:27:00   112   113   98     98      98
508 2025-01-01 08:28:00    98   112   98    109     100
509 2025-01-01 08:29:00   107   112  103    107     112

63750


In [14]:
!pip install pytorch-forecasting pytorch_lightning



In [None]:
import pandas as pd
import torch
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data.encoders import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
import lightning.pytorch as pl # Instead of import pytorch_lightning as pl

from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import LearningRateMonitor
from sklearn.preprocessing import MinMaxScaler

# Data Preparation
final_data_frame['DateTime'] = pd.to_datetime(final_data_frame['DateTime'])
final_data_frame = final_data_frame.drop_duplicates(subset=['DateTime']).reset_index(drop=True)
final_data_frame['time_idx'] = (final_data_frame['DateTime'] - final_data_frame['DateTime'].min()).dt.total_seconds().astype(int)
final_data_frame['time_idx'] //= 60  # Convert from seconds to minutes
final_data_frame['group_id'] = 'stock_price'  # Adding a constant group_id for simplicity
scaler = MinMaxScaler()
final_data_frame[['Open', 'High', 'Low', 'Close', 'Volume']] = scaler.fit_transform(final_data_frame[['Open', 'High', 'Low', 'Close', 'Volume']])

# TimeSeriesDataSet Setup
max_prediction_length = 5
max_encoder_length = 20
training_cutoff = final_data_frame['time_idx'].max() - max_prediction_length

training = TimeSeriesDataSet(
    final_data_frame[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="Close",
    group_ids=["group_id"],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=[],
    static_reals=[],
    time_varying_known_categoricals=[],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=["Open", "High", "Low", "Close", "Volume"],
    target_normalizer=GroupNormalizer(groups=["group_id"], transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# DataLoader for Training and Validation
batch_size = 64
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=4)
val_dataloader = training.to_dataloader(train=False, batch_size=batch_size, num_workers=4)

# PyTorch Lightning Trainer Configuration
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=10)

# Temporal Fusion Transformer Model Initialization
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # Predicting 7 quantiles
    loss=QuantileLoss(),
    log_interval=10,
    reduce_on_plateau_patience=4,
)

# Training the Model
trainer.fit(tft, train_dataloader, val_dataloader)

# After prediction, ensure tensors are on the same device
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)]).cpu()
predictions = tft.predict(val_dataloader).cpu()

# Now compute the mean absolute error
print((actuals - predictions).abs().mean())
