Imports

In [5]:
import boto3
import json

This initializes our cloudflare client

In [6]:
with open('ossig_r2_pull_keys', 'r') as f:
    data = f.read().splitlines()

access_key_id = data[4]
secret_access_key = data[7]
s3_endpoint = data[10]

s3 = boto3.resource(
    service_name ="s3",
    endpoint_url = s3_endpoint,
    aws_access_key_id = access_key_id,
    aws_secret_access_key = secret_access_key,
    region_name="enam", # Must be one of: wnam, enam, weur, eeur, apac, auto
)
bucket = s3.Bucket('ossig-stock-data')

Here we pull our object from memory and convert it back to a dict
- **Warning: Use this code as a block, putting code in between may have the downloaded object erased from memory**

In [7]:
pulled_json_bytes = bucket.Object("polygon-30m/NVDA/2025-03").get()
decoded_json = pulled_json_bytes['Body'].read().decode('utf-8')
decoded_dict = json.loads(decoded_json)

## Dict Structure
- The loaded dict is for a single month (as you can see in the object's name)
    - This the keys of this dictionary are the day values. **Except for the last value, this is the 'complete' value, which determines whether or not the dict has the complete month's worth of data. This should be ignored when iterating over the keys.
    - The values of this dict are sub-dictionaries
        - The sub-dictionaries have 3 different keys each: 'pre-market', 'regular-market', and 'after-hours'. These correspond to what you think they do.
        - The values of these sub-dictionaries is a list
            - This is a list of all the aggregate bars for each 30-minute timestamp of that section of market hours
                - Each element of this list is a dict
                - This dict contains
                    - ('v') the volume,
                    - ('vw') the volume-weighted average price,
                    - ('o') the open price,
                    - ('c') the close price,
                    - ('h') the highest price within this period,
                    - ('l') the lowest price within this period,
                    - ('t') the Unix millisecond timestamp for the start of the aggregate window,
                    - ('n') the number of transactions in the aggregate window

In [8]:
decoded_dict

{'2025-03-03': {'pre-market': [{'v': 813136,
    'vw': 123.882,
    'o': 124.12,
    'c': 123.9,
    'h': 125.38,
    'l': 123.05,
    't': 1740992400000,
    'n': 13293},
   {'v': 346284,
    'vw': 124.1429,
    'o': 123.94,
    'c': 124.2,
    'h': 124.5,
    'l': 123.64,
    't': 1740994200000,
    'n': 5411},
   {'v': 259593,
    'vw': 123.8658,
    'o': 124.23,
    'c': 123.82,
    'h': 124.35,
    'l': 123.55,
    't': 1740996000000,
    'n': 4323},
   {'v': 349031,
    'vw': 123.7044,
    'o': 123.82,
    'c': 123.51,
    'h': 124.2,
    'l': 123.32,
    't': 1740997800000,
    'n': 5439},
   {'v': 415594,
    'vw': 123.4927,
    'o': 123.51,
    'c': 123.7,
    'h': 123.95,
    'l': 123.11,
    't': 1740999600000,
    'n': 5849},
   {'v': 232100,
    'vw': 123.7794,
    'o': 123.74,
    'c': 123.97,
    'h': 124.1,
    'l': 123.52,
    't': 1741001400000,
    'n': 3993},
   {'v': 787023,
    'vw': 124.4441,
    'o': 123.96,
    'c': 124.62,
    'h': 124.94,
    'l': 123.8,
    

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class MarketDataset(Dataset):
    def __init__(self, decoded_dict, sessions_to_include=None, fields_to_include=None):
        if sessions_to_include is None:
            sessions_to_include = ['pre-market', 'regular-market', 'after-market']
        if fields_to_include is None:
            fields_to_include = ['v', 'vw', 'o', 'c', 'h', 'l', 't', 'n']
        
        self.samples = []

        for date, sessions in decoded_dict.items():
            if isinstance(sessions, dict):
                for session_name in sessions_to_include:
                    if session_name in sessions:
                        for record in sessions[session_name]:
                            try:
                                sample = [record[field] for field in fields_to_include]
                                self.samples.append(sample)
                            except KeyError:
                                continue


    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return torch.tensor(sample, dtype=torch.float32)

if __name__ == "__main__":

    dataset = MarketDataset(
        decoded_dict,
        sessions_to_include=['regular-market'],
        fields_to_include=['o', 'h', 'l', 'c', 'v']
    )

    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)

    for batch in dataloader:
        print(batch.shape)
        break

torch.Size([32, 5])


In [10]:
len(dataset) 

273

In [11]:
dataset[0]

tensor([1.2351e+02, 1.2370e+02, 1.1864e+02, 1.1900e+02, 4.9934e+07])

In [12]:
torch.save(dataset.samples, "market_dataset.pt")

In [13]:
###
import torch

all_records = []

for date, sessions in decoded_dict.items():
    if isinstance(sessions, dict):
        for session_name in ['pre-market', 'regular-market', 'after-market']:
            if session_name in sessions:
                all_records.extend(sessions[session_name])

tensor_input = []

for record in all_records:
    tensor_input.append([
        record['v'],
        record['vw'],
        record['o'],
        record['c'],
        record['h'],
        record['l'],
        record['t'],
        record['n']
    ])

tensor_data = torch.tensor(tensor_input, dtype=torch.float32)

print(tensor_data.shape)
print(tensor_data)

torch.Size([504, 8])
tensor([[8.1314e+05, 1.2388e+02, 1.2412e+02,  ..., 1.2305e+02, 1.7410e+12,
         1.3293e+04],
        [3.4628e+05, 1.2414e+02, 1.2394e+02,  ..., 1.2364e+02, 1.7410e+12,
         5.4110e+03],
        [2.5959e+05, 1.2387e+02, 1.2423e+02,  ..., 1.2355e+02, 1.7410e+12,
         4.3230e+03],
        ...,
        [1.7335e+07, 1.0770e+02, 1.0756e+02,  ..., 1.0714e+02, 1.7434e+12,
         1.3172e+05],
        [1.3097e+07, 1.0751e+02, 1.0718e+02,  ..., 1.0652e+02, 1.7434e+12,
         1.0055e+05],
        [3.0494e+07, 1.0809e+02, 1.0725e+02,  ..., 1.0713e+02, 1.7434e+12,
         2.4679e+05]])


In [14]:
import torch

data = torch.load('/Users/steven/Downloads/Numerical-Data-Handler-main/market_dataset.pt')

for i in range(50):
    print(f"Item {i}: {data[i]}")

Item 0: [123.51, 123.7, 118.64, 119, 49933730.0]
Item 1: [119.04, 119.92, 118.3, 119.51, 32366719.0]
Item 2: [119.515, 120.03, 118.51, 118.6178, 22091821.0]
Item 3: [118.61, 120.2, 118.61, 119.22, 16848897.0]
Item 4: [119.22, 119.5, 118.6, 119, 14567063.0]
Item 5: [119.01, 119.035, 118.25, 118.41, 15915920.0]
Item 6: [118.41, 118.43, 116.45, 116.81, 27160567.0]
Item 7: [116.8, 117.01, 114.51, 115.24, 37435683.0]
Item 8: [115.24, 116.66, 115.04, 116.55, 20739160.0]
Item 9: [116.54, 117.27, 115.6, 116.05, 19660712.0]
Item 10: [116.04, 117.62, 115.01, 115.1299, 22524572.0]
Item 11: [115.13, 115.17, 112.28, 113.17, 42666250.0]
Item 12: [113.16, 114.2, 112.5, 114.01, 44068581.0]
Item 13: [110.645, 115.14, 110.14, 112.911, 62345260.0]
Item 14: [112.91, 113.2, 110.11, 111.61, 39620326.0]
Item 15: [111.6, 113.3, 111.03, 112.405, 26761230.0]
Item 16: [112.3609, 112.8, 110.9, 111.83, 20225612.0]
Item 17: [111.82, 115.18, 111.82, 114.384, 26703016.0]
Item 18: [114.39, 116.6, 113.86, 116.0102, 244

In [16]:
for i, item in enumerate(data):
    print(f"Item {i}: {item}")

Item 0: [123.51, 123.7, 118.64, 119, 49933730.0]
Item 1: [119.04, 119.92, 118.3, 119.51, 32366719.0]
Item 2: [119.515, 120.03, 118.51, 118.6178, 22091821.0]
Item 3: [118.61, 120.2, 118.61, 119.22, 16848897.0]
Item 4: [119.22, 119.5, 118.6, 119, 14567063.0]
Item 5: [119.01, 119.035, 118.25, 118.41, 15915920.0]
Item 6: [118.41, 118.43, 116.45, 116.81, 27160567.0]
Item 7: [116.8, 117.01, 114.51, 115.24, 37435683.0]
Item 8: [115.24, 116.66, 115.04, 116.55, 20739160.0]
Item 9: [116.54, 117.27, 115.6, 116.05, 19660712.0]
Item 10: [116.04, 117.62, 115.01, 115.1299, 22524572.0]
Item 11: [115.13, 115.17, 112.28, 113.17, 42666250.0]
Item 12: [113.16, 114.2, 112.5, 114.01, 44068581.0]
Item 13: [110.645, 115.14, 110.14, 112.911, 62345260.0]
Item 14: [112.91, 113.2, 110.11, 111.61, 39620326.0]
Item 15: [111.6, 113.3, 111.03, 112.405, 26761230.0]
Item 16: [112.3609, 112.8, 110.9, 111.83, 20225612.0]
Item 17: [111.82, 115.18, 111.82, 114.384, 26703016.0]
Item 18: [114.39, 116.6, 113.86, 116.0102, 244

In [15]:
print(f"The list has {len(data)} items.")

The list has 273 items.
