In [None]:
import os
import random
import tarfile
import shutil
import polars as pl
import pandas as pd

In [None]:
# Run switch read by the archive-selection cell below: while it is False only
# a small random sample of the BBO and trade archives is processed; set it to
# True to process every archive that is found.
full_analysis = False

In [None]:
# Paths to where the original tar files are stored
data_dir_bbo   = "SP500_2010/bbo"
data_dir_trade = "SP500_2010/trade"

# Root folder that receives the extracted files
extracted_base_dir = "SP500_2010_extracted"
os.makedirs(extracted_base_dir, exist_ok=True)

# How many archives of each kind to keep when full_analysis is False
SAMPLE_SIZE = 10

# Gather up all .tar files (sorted, because os.listdir returns entries in
# arbitrary order)
bbo_tar_files = sorted(
    os.path.join(data_dir_bbo, f)
    for f in os.listdir(data_dir_bbo)
    if f.endswith(".tar")
)

trade_tar_files = sorted(
    os.path.join(data_dir_trade, f)
    for f in os.listdir(data_dir_trade)
    if f.endswith(".tar")
)

print("BBO tar files found:", len(bbo_tar_files))
print("Trade tar files found:", len(trade_tar_files))

if not full_analysis:
    # random.sample raises ValueError when asked for more items than the list
    # holds, so cap the request at the number of archives actually found.
    # NOTE(review): the two samples are drawn independently of each other, so
    # the sampled BBO archives and the sampled trade archives need not belong
    # to the same tickers -- confirm that this is intended.
    bbo_tar_files = random.sample(
        bbo_tar_files, min(SAMPLE_SIZE, len(bbo_tar_files))
    )
    trade_tar_files = random.sample(
        trade_tar_files, min(SAMPLE_SIZE, len(trade_tar_files))
    )

print("BBO tar files to process:", len(bbo_tar_files))
print("Trade tar files to process:", len(trade_tar_files))

# The source tree "SP500_2010" is deliberately NOT removed here. Every path
# collected above points into it and is only opened by the extraction step
# that follows, so deleting the tree at this point makes each extraction fail
# with a missing-file error. Remove it by hand once extraction has finished,
# if the disk space is needed.

In [None]:

def extract_tar_to_ticker_subfolder(tar_path, parent_out_dir, subfolder_name):
    """
    Extract the regular files of a .tar archive into the directory structure:
       parent_out_dir / <TICKER> / <subfolder_name> / <extracted files>

    The TICKER is inferred from the tar file name: the part of the base name
    (without its final extension) that precedes the first "-".
    Directories stored inside the archive are not recreated; every regular
    file is written directly into the output folder under its base name, so
    two members sharing a base name overwrite one another. Members that are
    not regular files (directories, links, ...) are skipped.

    Args:
        tar_path: Path of the .tar archive to read.
        parent_out_dir: Root folder under which the ticker folder is created.
        subfolder_name: Folder created inside the ticker folder; the calling
            code passes 'bbo' or 'trade' (depending on the source).

    Prints a one-line summary once the archive has been processed.
    """
    archive_stem = os.path.splitext(os.path.basename(tar_path))[0]
    ticker = archive_stem.split("-")[0]

    # One call creates the ticker folder and its subfolder, and is a no-op
    # when they already exist.
    output_dir = os.path.join(parent_out_dir, ticker, subfolder_name)
    os.makedirs(output_dir, exist_ok=True)

    # Extraction filters exist only on Python versions that expose
    # tarfile.data_filter. Where they do, request the "data" filter explicitly:
    # it applies the safe-extraction checks and avoids the DeprecationWarning
    # that a filter-less extract() emits on Python 3.12/3.13.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    # Extract the .tar
    with tarfile.open(tar_path, "r") as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue
            # Avoid preserving deep internal paths by stripping them
            member.name = os.path.basename(member.name)
            tar.extract(member, output_dir, **extract_kwargs)

    print(f"Extracted {tar_path} -> {output_dir}")


# Unpack both archive collections, BBO first and TRADE second. The subfolder
# name paired with each collection is handed to the extractor unchanged.
for archive_list, source_kind in (
    (bbo_tar_files, "bbo"),
    (trade_tar_files, "trade"),
):
    for tar_file in archive_list:
        extract_tar_to_ticker_subfolder(
            tar_path=tar_file,
            parent_out_dir=extracted_base_dir,
            subfolder_name=source_kind,
        )

In [None]:
extracted_dir = "SP500_2010_extracted"

PARQUET_EXT = ".parquet"

# Convert every .parquet file below extracted_dir into a .csv file placed next
# to it, then delete the .parquet original.
for root, dirs, files in os.walk(extracted_dir):
    for file in files:
        if not file.endswith(PARQUET_EXT):
            continue

        parquet_path = os.path.join(root, file)

        # 1) Construct the CSV name by swapping only the trailing extension.
        #    str.replace would also rewrite any ".parquet" that occurs earlier
        #    in the file name.
        csv_name = file[: -len(PARQUET_EXT)] + ".csv"
        csv_path = os.path.join(root, csv_name)

        # 2) Read the parquet file into a pandas DataFrame
        df = pd.read_parquet(parquet_path)

        # 3) Save to CSV (without the DataFrame index)
        df.to_csv(csv_path, index=False)

        print(f"Converted {parquet_path} -> {csv_path}")

        # 4) Remove the original; this is reached only when the CSV was
        #    written without raising.
        os.remove(parquet_path)

In [None]:
# Excel's day offset to 1970-01-01
EXCEL_EPOCH_OFFSET = 25569
SECONDS_PER_DAY    = 86400

# Fixed EDT offset (UTC-4). The conversion below does not use it: a constant
# offset is only correct while daylight saving time is in effect. The name is
# kept in case another cell still refers to it.
UTC_OFFSET_HOURS   = 4

# IANA zone used for the DST-aware UTC -> New York wall-clock conversion
NY_TIME_ZONE = "America/New_York"

# Directory containing your .csv files
base_dir = "SP500_2010_extracted"

for root, dirs, files in os.walk(base_dir):
    for filename in files:
        if not filename.endswith(".csv"):
            continue

        csv_path = os.path.join(root, filename)

        # 1) Read with Polars
        df = pl.read_csv(csv_path)

        # 2) Files without "xltime" (for example ones already converted by an
        #    earlier run) are left untouched, so nothing is rewritten or
        #    reported for them.
        if "xltime" not in df.columns:
            continue

        # A) Excel serial days -> integer nanoseconds since 1970-01-01.
        #    NOTE(review): xltime is taken to be expressed in UTC, as the
        #    original version of this cell assumed -- confirm against the
        #    data provider's documentation.
        utc_nanoseconds = (
            (pl.col("xltime") - EXCEL_EPOCH_OFFSET)
            * SECONDS_PER_DAY
            * 1_000_000_000
        ).cast(pl.Int64)

        # B) Tag the instants as UTC, convert them to New York time (the
        #    offset follows the DST rules of each date, and is UTC-4 during
        #    EDT), then drop the zone again so the column stays a naive
        #    local timestamp.
        ny_timestamp = (
            utc_nanoseconds
            .cast(pl.Datetime("ns"))
            .dt.replace_time_zone("UTC")
            .dt.convert_time_zone(NY_TIME_ZONE)
            .dt.replace_time_zone(None)
            .alias("ny_timestamp")
        )

        df = df.with_columns(ny_timestamp).drop("xltime")

        # 3) Overwrite the same CSV
        df.write_csv(csv_path)
        print(f"Overwrote {csv_path} with local 'ny_timestamp' column.")


In [8]:
# Remove the "utc_timestamp" column from every CSV below base_dir that has
# one. CSVs without that column are read but never rewritten.
for folder, _subfolders, entries in os.walk(base_dir):
    for entry in entries:
        if not entry.endswith(".csv"):
            continue

        csv_path = os.path.join(folder, entry)
        df = pl.read_csv(csv_path)

        # Nothing to strip: leave the file on disk exactly as it is.
        if "utc_timestamp" not in df.columns:
            continue

        df = df.drop("utc_timestamp")
        df.write_csv(csv_path)
        print(f"Overwrote {csv_path} without utc_timestamp.")