In [1]:
from src.data_utils.ingestion import BronzeIngestor
from src.data_utils.processor import SilverProcessor
from src.data_utils.builder import GoldBuilder

from src.utils.cleanup import vacuum_database

In [4]:
# Pipeline locations, given relative to the working directory.
# Raw characteristics CSV consumed by the Bronze ingestion stage.
csv_path = "data/raw/characteristics_raw.csv"
# SQLite database file shared by the pipeline stages and the cleanup step.
db_path = "data/processed/lakehouse.sqlite"

# Bronze Files

In [3]:
# Bronze stage: ingest the raw characteristics CSV in chunks and download CRSP
# data. Per the log output below, the results land in the tables
# 'bronze_characteristics' and 'bronze_crsp' (presumably inside the SQLite
# file at db_path -- confirm against BronzeIngestor).
ingestor = BronzeIngestor(db_path, csv_path)
ingestor.run()

Ingesting data/raw/characteristics_raw.csv in chunks...
Processed chunk 1...
Processed chunk 6...
Processed chunk 11...
Processed chunk 16...
Processed chunk 21...
Processed chunk 26...
Processed chunk 31...
Processed chunk 36...
Processed chunk 41...
GKX characteristics loaded into 'bronze_characteristics'.
Starting CRSP download: 1957-01-31 to 2021-12-31...
CRSP data loaded into 'bronze_crsp'.


# Silver Files

In [5]:
# Silver stage: process the data month by month in parallel. In the recorded
# run below this covered 780 months on 8 workers and took about 19 minutes;
# the log reports the output as Parquet.
processor = SilverProcessor(db_path)
processor.run_parallel()

[2026-02-06 16:19:12] [INFO] [SilverProcessor] Starting Parallel Processing (Jobs: -1)...
[2026-02-06 16:19:16] [INFO] [SilverProcessor] Found 780 months to process.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 14.8min


[2026-02-06 16:38:22] [INFO] [SilverProcessor] Silver Processing Complete (Parquet).


[Parallel(n_jobs=-1)]: Done 780 out of 780 | elapsed: 19.1min finished


# Gold Files

In [2]:
# Gold stage: build the gold panel from the Silver output directory.
# The database location is taken from the shared `db_path` variable set in the
# configuration cell instead of repeating the literal here, so every stage is
# guaranteed to work against the same SQLite file if that path is ever changed.
builder = GoldBuilder(
    db_path=db_path,
    silver_path="data/processed/silver",
)
builder.run()

[2026-02-06 18:06:54] [INFO] [GoldBuilder] BUILDING GOLD PANEL AT data/processed/gold_panel...
[2026-02-06 18:06:54] [INFO] [GoldBuilder] Loading Returns for Lookup...


Processing Years: 100%|██████████| 65/65 [02:18<00:00,  2.13s/it]

[2026-02-06 18:09:20] [INFO] [GoldBuilder] GOLD PANEL PARQUET BUILD COMPLETE





# Clean up

In [5]:
# Cleanup: per the log below, this drops the intermediate tables
# (silver_characteristics, gold_panel, silver_temp) and then runs VACUUM,
# which rewrites the database file.
# drop_bronze_char=False presumably keeps 'bronze_characteristics' in place --
# confirm against vacuum_database.
vacuum_database(db_path, drop_bronze_char=False)

[2026-02-06 18:14:20] [INFO] [DB_Cleanup] Cleaning database: data/processed/lakehouse.sqlite
[2026-02-06 18:14:35] [INFO] [DB_Cleanup] Dropped table: silver_characteristics
[2026-02-06 18:14:47] [INFO] [DB_Cleanup] Dropped table: gold_panel
[2026-02-06 18:14:47] [INFO] [DB_Cleanup] Dropped table: silver_temp
[2026-02-06 18:14:47] [INFO] [DB_Cleanup] Running VACUUM (This re-writes the DB file)...
[2026-02-06 18:16:38] [INFO] [DB_Cleanup] ✅ Database optimized. Size should be significantly reduced.
