In [1]:
import polars as pl
import tarfile
import os
import datetime as dt
from pathlib import Path
from tqdm import tqdm

from src.preprocessing import preprocess_all_tickers

# Autoreload extension for Jupyter notebooks: with mode 2, all imported modules
# (e.g. src.preprocessing) are reloaded before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load one day (2016-01-04) of AMZN.OQ bid/ask quote data and display it to
# inspect the input columns and dtypes.
amzn_quotes_path = "data/AMZN.OQ/2016-01-04-AMZN.OQ-bbo.parquet"
df_amzn = pl.read_parquet(amzn_quotes_path)
df_amzn

xltime,bid-price,bid-volume,ask-price,ask-volume
f64,f64,i32,f64,i32
42373.375006,0.0,0,669.8,1
42373.375014,286.0,1,669.8,1
42373.375616,286.0,1,0.0,0
42373.380269,286.0,1,695.0,1
42373.380503,286.0,1,0.0,0
…,…,…,…,…
42373.953085,637.0,1,638.75,1
42373.96403,637.0,1,638.1,1
42373.96403,637.0,1,638.75,1
42373.984551,638.5,2,638.75,1


In [17]:
# Run the project's preprocessing pipeline (src.preprocessing) over every ticker.
# As the recorded output shows, per-ticker failures are printed rather than
# raised, followed by a count of successfully preprocessed tickers.
# NOTE(review): presumably this writes the per-ticker parquet files under
# data/preprocessed/ that later cells read — confirm in src/preprocessing.
preprocess_all_tickers()

Preprocessing tickers:   0%|          | 0/9 [00:00<?, ?it/s]

Error preprocessing XOM.OQ: No parquet files found for XOM.OQ
Error preprocessing BRK.B.OQ: No parquet files found for BRK.B.OQ
Error preprocessing FOX.N: No parquet files found for FOX.N
Error preprocessing KHC.N: No parquet files found for KHC.N
Error preprocessing TXN.N: No parquet files found for TXN.N
Error preprocessing ABBV.OQ: No parquet files found for ABBV.OQ
Error preprocessing WBA.N: No parquet files found for WBA.N
Error preprocessing FB.N: No parquet files found for FB.N
Error preprocessing ORCL.OQ: No parquet files found for ORCL.OQ
93 tickers preprocessed successfully over 102 attempted.


In [8]:
# Load the preprocessed AGN.N series and display it as a spot check of the
# preprocessing output.
agn_preprocessed_path = "data/preprocessed/SP100/bbo/AGN.N.parquet"
df_1 = pl.read_parquet(agn_preprocessed_path)
df_1

timestamp,mid_price,mid_price_return
"datetime[μs, America/New_York]",f64,f64
2015-01-02 09:33:00 EST,213.905,0.001498
2015-01-02 09:34:00 EST,213.97,0.000304
2015-01-02 09:35:00 EST,214.33,0.001682
2015-01-02 09:36:00 EST,214.28,-0.000233
2015-01-02 09:37:00 EST,214.335,0.000257
…,…,…
2017-03-31 15:55:00 EDT,239.62,0.000397
2017-03-31 15:56:00 EDT,239.445,-0.00073
2017-03-31 15:57:00 EDT,239.46,0.000063
2017-03-31 15:58:00 EDT,239.565,0.000438


In [18]:
# Load the preprocessed AAPL.OQ series and display it as a second spot check of
# the preprocessing output.
aapl_preprocessed_path = "data/preprocessed/SP100/bbo/AAPL.OQ.parquet"
df_2 = pl.read_parquet(aapl_preprocessed_path)
df_2

timestamp,mid_price,mid_price_return
"datetime[μs, America/New_York]",f64,f64
2015-01-02 09:31:00 EST,111.295,0.000404
2015-01-02 09:32:00 EST,111.29,-0.000045
2015-01-02 09:33:00 EST,111.175,-0.001033
2015-01-02 09:34:00 EST,111.05,-0.001124
2015-01-02 09:35:00 EST,111.22,0.001531
…,…,…
2017-04-12 15:55:00 EDT,141.685,-0.000141
2017-04-12 15:56:00 EDT,141.725,0.000282
2017-04-12 15:57:00 EDT,141.695,-0.000212
2017-04-12 15:58:00 EDT,141.705,0.000071


In [25]:
import polars as pl
from pathlib import Path

# Sanity check: every trading day in every preprocessed file should contain
# exactly one row per expected minute. Files with days that have too many or
# too few rows are reported and counted.
folder = Path("data/preprocessed/SP100/bbo")
files = list(folder.glob("*.parquet"))
if not files:
    # Without this guard a wrong path (or a preprocessing run that produced
    # nothing) would report "0 out of 0" and look like a clean validation.
    raise FileNotFoundError(f"No parquet files found in {folder}")
problematic_files = 0

# Expected number of rows per day: one row per minute from 09:31 to 15:59
# inclusive, i.e. (15 - 9) * 60 + (59 - 31 + 1) = 389 minutes.
# NOTE(review): shortened trading sessions, if present in the data, would be
# reported as days with missing rows — confirm how preprocessing treats them.
EXPECTED_ROWS_PER_DAY = 389

# Per-file results keyed by file name, so individual tickers can be inspected
# after the loop instead of relying on the variables left over from the last
# iteration.
overmatched_by_file = {}
undermatched_by_file = {}

for f in files:
    df = pl.read_parquet(f, columns=["timestamp"])
    # Derive each row's calendar date so that rows can be counted per day.
    df = df.with_columns(pl.col("timestamp").dt.date().alias("date"))

    # Count rows per day and split off the days that deviate from the target.
    rows_per_day = df.group_by("date").len().rename({"len": "num_rows"})
    overmatched_days = rows_per_day.filter(pl.col("num_rows") > EXPECTED_ROWS_PER_DAY)
    undermatched_days = rows_per_day.filter(pl.col("num_rows") < EXPECTED_ROWS_PER_DAY)
    overmatched_by_file[f.name] = overmatched_days
    undermatched_by_file[f.name] = undermatched_days

    if overmatched_days.height > 0 or undermatched_days.height > 0:
        print(f"File {f.name} has {overmatched_days.height} days with extra rows and {undermatched_days.height} days with missing rows")
        problematic_files += 1
    else:
        print(f"File {f.name} is fine (all days have {EXPECTED_ROWS_PER_DAY} rows).")

# Overall summary across all files.
print(f"Total problematic files: {problematic_files} out of {len(files)}")



File EXC.N.parquet has 2 days with extra rows and 143 days with missing rows
File MA.N.parquet has 2 days with extra rows and 172 days with missing rows
File GOOG.OQ.parquet has 3 days with extra rows and 93 days with missing rows
File GOOGL.OQ.parquet has 1 days with extra rows and 72 days with missing rows
File PEP.N.parquet has 0 days with extra rows and 185 days with missing rows
File AMGN.OQ.parquet has 10 days with extra rows and 12 days with missing rows
File BK.N.parquet has 2 days with extra rows and 197 days with missing rows
File F.N.parquet has 2 days with extra rows and 223 days with missing rows
File C.N.parquet has 3 days with extra rows and 228 days with missing rows
File CVS.N.parquet has 1 days with extra rows and 171 days with missing rows
File VZ.N.parquet has 5 days with extra rows and 152 days with missing rows
File DIS.N.parquet has 2 days with extra rows and 219 days with missing rows
File JNJ.N.parquet has 0 days with extra rows and 172 days with missing rows
F

In [26]:
# NOTE: `overmatched_days` is assigned inside the validation loop above, so at
# this point it holds the result for the LAST file the loop iterated only.
# Which file that is depends on the order returned by `folder.glob`.
overmatched_days

date,num_rows
date,u32
2017-02-21,390
2017-01-13,390
2017-03-02,390
2016-12-09,390
2016-02-04,390
2016-11-22,390
2017-02-08,390


In [29]:
# Inspect the preprocessed AMZN.OQ rows for a single day (2016-11-22), a date
# that appears in the over-count listing shown earlier in the notebook.
amzn_preprocessed_path = "data/preprocessed/SP100/bbo/AMZN.OQ.parquet"
df_AMZN = pl.read_parquet(amzn_preprocessed_path)
day_of_interest = dt.date(2016, 11, 22)
is_day_of_interest = pl.col("timestamp").dt.date() == day_of_interest
df_AMZN.filter(is_day_of_interest)


timestamp,mid_price,mid_price_return
"datetime[μs, America/New_York]",f64,f64
2016-11-22 09:31:00 EST,786.545,-0.000267
2016-11-22 09:32:00 EST,786.545,-1.4454e-16
2016-11-22 09:33:00 EST,790.485,0.005009
2016-11-22 09:34:00 EST,791.515,0.001303
2016-11-22 09:35:00 EST,790.23,-0.001623
…,…,…
2016-11-22 15:56:00 EST,785.565,-0.000356
2016-11-22 15:57:00 EST,785.88,0.000401
2016-11-22 15:58:00 EST,786.025,0.000185
2016-11-22 15:59:00 EST,785.53,-0.00063
