<a href="https://colab.research.google.com/github/UoB-DSMP-2023-24/dsmp-2024-group22/blob/main/TapesAnalysisTFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run Colab's user authentication flow before any Google Cloud client is created.
# NOTE(review): presumably the storage client built in a later cell relies on
# these credentials - confirm when running outside Colab.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Google Cloud Storage client; later cells use `gcs` to open the bucket.
from google.cloud import storage
gcs = storage.Client()

In [None]:
import pandas as pd
import io

# Location of the tape files: bucket name and the object-name prefix to list.
bucket_name = 'jpm-tapes'
directory_path = 'Tapes/'

# Bucket handle used by the loading cell to list and download blobs.
bucket = gcs.get_bucket(bucket_name)

In [None]:
import re  # only this cell needs it, so the import stays local

# Date stamped on a tape whose file name carries no recognisable date.
DEFAULT_SESSION_DATE = pd.Timestamp('2025-01-01')
# The first YYYY-MM-DD token in the blob name is taken as the trading date.
# NOTE(review): assumes each tape file embeds its date in its name - confirm
# against the bucket listing; files without one fall back to the default date.
SESSION_DATE_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2}')
BAR_FREQUENCY = '1min'  # 'min' is the non-deprecated spelling of the old 'T' alias
COLUMN_NAMES = {'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close'}

blobs = bucket.list_blobs(prefix=directory_path)  # Lists all the blobs in the directory

all_data_frames = []  # One OHLCV frame per tape file

for blob in blobs:
    if not blob.name.endswith('.csv'):  # Skip anything that is not a CSV tape
        continue

    # Read the content of the file
    data = blob.download_as_bytes()
    df = pd.read_csv(io.BytesIO(data), names=['Time', 'Price', 'Quantity'])
    if df.empty:
        continue

    # Anchor 'Time' (seconds from the start of the day) on the file's own date.
    # Previously every file was anchored on the same day, so the bars of all
    # sessions ended up sharing identical timestamps.
    date_match = SESSION_DATE_PATTERN.search(blob.name)
    session_date = pd.to_datetime(date_match.group(0), errors='coerce') if date_match else pd.NaT
    if pd.isna(session_date):
        session_date = DEFAULT_SESSION_DATE

    df['DateTime'] = pd.to_datetime(df['Time'], unit='s', origin=session_date)
    df = df.set_index('DateTime')

    # Resample to one-minute bars and calculate OHLCV
    ohlc = df['Price'].resample(BAR_FREQUENCY).ohlc()
    ohlc['Volume'] = df['Quantity'].resample(BAR_FREQUENCY).sum()

    all_data_frames.append(ohlc)

if not all_data_frames:
    # pd.concat([]) would raise a far less helpful "No objects to concatenate".
    raise ValueError(f"No non-empty CSV tapes found under prefix '{directory_path}'.")

# Concatenate in listing order and rename by label (not by position), so the
# displayed frame already shows the column names the later cells expect.
final_data_frame = pd.concat(all_data_frames).rename(columns=COLUMN_NAMES)

# Display the result
print(final_data_frame.head())
print()
print(final_data_frame.tail())
print()
print(len(final_data_frame))


                     open  high  low  close  Volume
DateTime                                           
2025-01-01 00:00:00   267   270  252    261      68
2025-01-01 00:01:00   259   267  254    265     106
2025-01-01 00:02:00   261   269  250    266      89
2025-01-01 00:03:00   269   269  258    261      86
2025-01-01 00:04:00   261   270  258    269      78

                     open  high  low  close  Volume
DateTime                                           
2025-01-01 08:25:00   108   112  101    105     108
2025-01-01 08:26:00   108   112  104    107     112
2025-01-01 08:27:00   112   113   98     98      98
2025-01-01 08:28:00    98   112   98    109     100
2025-01-01 08:29:00   107   112  103    107     112

63750


In [None]:
!pip install pytorch-forecasting torch


Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.0.0-py3-none-any.whl (140 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/140.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m92.2/140.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.4/140.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi>=0.80 (from pytorch-forecasting)
  Downloading fastapi-0.110.2-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.2.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting optuna<4.0.0,>=3.1.0 (from pytorch-forecastin

In [None]:
import numpy as np
import torch
import pytorch_forecasting
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.metrics import MAE, RMSE
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score


In [None]:
import numpy as np
import pandas as pd
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import GroupNormalizer

# --- Tunable settings ---------------------------------------------------------
TRAIN_FRACTION = 0.9         # leading share of the bars used for training
MAX_ENCODER_LENGTH = 60      # minutes of history given to the encoder
MAX_PREDICTION_LENGTH = 15   # minutes forecast by the decoder
SERIES_ID = 'tape'           # the concatenated tapes are modelled as one series
PRICE_VOLUME_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Volume']

if final_data_frame.empty:
    raise ValueError("final_data_frame is empty; load the tapes before building datasets.")

# Ensure 'DateTime' is a column (rebinding instead of inplace keeps re-runs safe)
if 'DateTime' not in final_data_frame.columns:
    final_data_frame = final_data_frame.reset_index()

stamps = final_data_frame['DateTime']
final_data_frame = final_data_frame.assign(
    # Single constant group: one continuous series of one-minute bars.
    series_id=SERIES_ID,
    # Running bar counter. Deriving time_idx from DateTime is unsafe here: the
    # recorded output shows 63750 bars all dated 2025-01-01, so timestamps repeat
    # and would yield duplicate indices; date-anchored tapes would yield gaps.
    # NOTE(review): relies on rows being in chronological order - verify the
    # order in which the tape files were concatenated.
    time_idx=np.arange(len(final_data_frame), dtype=np.int64),
    month=stamps.dt.month.astype(np.int64),
    # Categorical features must be strings for the dataset's label encoders.
    day=stamps.dt.day.astype(str),
    hour=stamps.dt.hour.astype(str),
).astype({column: float for column in PRICE_VOLUME_COLUMNS})  # floats for normalization

# Chronological split on the bar counter: leading bars train, trailing bars validate
training_cutoff = int(final_data_frame['time_idx'].max() * TRAIN_FRACTION)

training_data = final_data_frame[final_data_frame['time_idx'] <= training_cutoff].copy()
validation_data = final_data_frame[final_data_frame['time_idx'] > training_cutoff].copy()

# Check if the training or validation data is empty
if training_data.empty:
    raise ValueError("No data available for training. Adjust the cutoff date.")
if validation_data.empty:
    raise ValueError("No data available for validation. Adjust the cutoff date or data range.")
if len(validation_data) < MAX_ENCODER_LENGTH + MAX_PREDICTION_LENGTH:
    raise ValueError(
        "Validation data is shorter than one encoder + prediction window; "
        "lower TRAIN_FRACTION or load more data."
    )

# Check column names before fitting normalizer
print("Columns in training data:", training_data.columns)

# Fit the target normalizer on the training data only.
# GroupNormalizer.fit takes the target Series first and the DataFrame holding the
# group columns second; passing a single DataFrame raised the recorded
# "missing 1 required positional argument: 'X'" TypeError.
normalizer = GroupNormalizer(groups=['series_id'], transformation='softplus')
normalizer.fit(training_data['Close'], training_data)

# Creating a TimeSeriesDataSet for training
training = TimeSeriesDataSet(
    training_data,
    time_idx='time_idx',
    target='Close',
    group_ids=['series_id'],
    max_encoder_length=MAX_ENCODER_LENGTH,
    max_prediction_length=MAX_PREDICTION_LENGTH,
    # Calendar features are known ahead of time; 'hour' changes within the series,
    # so it is time-varying only and no longer also listed as static.
    time_varying_known_categoricals=['day', 'hour'],
    # Future Open/High/Low/Volume are not available when forecasting, so they are
    # declared unknown; listing them as known would leak future bars into the decoder.
    time_varying_unknown_reals=['Close', 'Open', 'High', 'Low', 'Volume'],
    target_normalizer=normalizer,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# Creating a TimeSeriesDataSet for validation (predicts the last window of the series)
validation = TimeSeriesDataSet.from_dataset(training, validation_data, predict=True, stop_randomization=True)


Columns in training data: Index(['DateTime', 'Open', 'High', 'Low', 'Close', 'Volume', 'time_idx',
       'month', 'day', 'hour'],
      dtype='object')


TypeError: GroupNormalizer.fit() missing 1 required positional argument: 'X'

In [None]:
# The install log shows pytorch-forecasting pulling in `lightning` 2.x, so the
# Trainer is taken from `lightning.pytorch` to match the model's framework.
from lightning.pytorch import Trainer, seed_everything

BATCH_SIZE = 64
SEED = 42

seed_everything(SEED)  # reproducible weight initialisation and batch shuffling

# Data loaders consumed by the trainer (previously referenced but never created)
train_dataloader = training.to_dataloader(train=True, batch_size=BATCH_SIZE, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=BATCH_SIZE * 10, num_workers=0)

# Define the TFT model.
# Despite its name, `pl_trainer` is the model itself; the name is kept because
# the evaluation cell refers to it.
pl_trainer = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,  # Model size
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # 7 quantiles by default
    loss=pytorch_forecasting.metrics.QuantileLoss(),
    log_interval=10,  # Log example every 10 batches
    reduce_on_plateau_patience=4,  # Reduce learning rate if no improvement in validation loss after x epochs
)

# Train the model
trainer = Trainer(
    max_epochs=20,
    # `gpus=` is not accepted by lightning 2.x; accelerator/devices select the
    # same hardware: one GPU when available, otherwise the CPU.
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    gradient_clip_val=0.1,
)
# The keyword is `train_dataloaders` (plural) in lightning 2.x.
trainer.fit(pl_trainer, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)


In [None]:
# Deterministic loader over the validation set, built here so this cell does not
# depend on a loader created in another cell.
eval_dataloader = validation.to_dataloader(train=False, batch_size=640, num_workers=0)

# Getting predictions: return_y=True hands back the targets from the same pass,
# so predictions and actuals are guaranteed to be aligned.
prediction_result = pl_trainer.predict(eval_dataloader, return_y=True)

# Move both tensors to the CPU: prediction may run on a GPU, and neither
# `.numpy()` nor a metric across mixed devices works on CUDA tensors.
predictions = prediction_result.output.detach().cpu()
actuals = prediction_result.y[0].detach().cpu()  # y is (target, weight)

mae = float(MAE()(predictions, actuals))
# Flatten before scoring: on 2-D input r2_score is computed per horizon column,
# which is undefined when there is only a single validation window.
r2 = r2_score(actuals.numpy().ravel(), predictions.numpy().ravel())

print(f"Mean Absolute Error: {mae}")
print(f"R-Squared: {r2}")
