In [1]:
from src.data_utils.ingestion import BronzeIngestor
from src.data_utils.processor import SilverProcessor
from src.data_utils.builder import GoldBuilder
from src.data_utils.metadata import DatabaseInspector

from src.utils.cleanup import vacuum_database

In [2]:
# Shared locations for the whole pipeline. Both are relative paths, so they
# resolve against the kernel's current working directory.
# SQLite database file handed to the ingestor, processor, builder, inspector
# and the final cleanup call.
db_path = "data/processed/lakehouse.sqlite"
# Raw characteristics CSV handed to BronzeIngestor.
csv_path = "data/raw/characteristics_raw.csv"

# Bronze Files

In [3]:
# Bronze stage: build the ingestor from the database path and the raw CSV
# path, then run it.
# NOTE(review): the recorded run also logs a CRSP download and a successful
# connection, so this step presumably needs network access and credentials
# configured outside the notebook -- confirm before a fresh-kernel run.
ingestor = BronzeIngestor(db_path, csv_path)
ingestor.run()

Ingesting data/raw/characteristics_raw.csv in chunks...
Processed chunk 1...
Processed chunk 6...
Processed chunk 11...
Processed chunk 16...
Processed chunk 21...
GKX characteristics loaded into 'bronze_characteristics'.
Starting CRSP download: 1957-01-31 to 2021-12-31...
Connection successfully made.
           date  risk_free
0    1926-07-01     0.0022
1    1926-08-01     0.0025
2    1926-09-01     0.0023
3    1926-10-01     0.0032
4    1926-11-01     0.0031
...         ...        ...
1189 2025-08-01     0.0038
1190 2025-09-01     0.0033
1191 2025-10-01     0.0037
1192 2025-11-01     0.0030
1193 2025-12-01     0.0034

[1194 rows x 2 columns]
CRSP data loaded into 'bronze_crsp'.


# Silver Files

In [4]:
# Silver stage: run the processor's parallel entry point against the database.
# NOTE(review): the recorded run took about 17 minutes across 8 workers and
# its completion log mentions Parquet output -- presumably results land on
# disk rather than in SQLite; confirm, and consider guarding accidental re-runs.
processor = SilverProcessor(db_path)
processor.run_parallel()

[2026-02-11 23:47:55] [INFO] [SilverProcessor] Starting Parallel Processing (Jobs: -1)...
[19570131, 19570228, 19570329, 19570430, 19570531, 19570628, 19570731, 19570830, 19570930, 19571031, 19571129, 19571231, 19580131, 19580228, 19580331, 19580430, 19580529, 19580630, 19580731, 19580829, 19580930, 19581031, 19581128, 19581231, 19590130, 19590227, 19590331, 19590430, 19590529, 19590630, 19590731, 19590831, 19590930, 19591030, 19591130, 19591231, 19600129, 19600229, 19600331, 19600429, 19600531, 19600630, 19600729, 19600831, 19600930, 19601031, 19601130, 19601230, 19610131, 19610228, 19610330, 19610428, 19610531, 19610630, 19610731, 19610831, 19610929, 19611031, 19611130, 19611229, 19620131, 19620228, 19620330, 19620430, 19620531, 19620629, 19620731, 19620831, 19620928, 19621031, 19621130, 19621231, 19630131, 19630228, 19630329, 19630430, 19630531, 19630628, 19630731, 19630830, 19630930, 19631031, 19631129, 19631231, 19640131, 19640228, 19640331, 19640430, 19640528, 19640630, 19640731,

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 14.1min


[2026-02-12 00:05:22] [INFO] [SilverProcessor] Silver Processing Complete (Parquet).


[Parallel(n_jobs=-1)]: Done 780 out of 780 | elapsed: 17.4min finished


# Gold Files

In [5]:
# Gold stage: construct the builder with the database path and the silver
# output directory, then run the build.
# NOTE(review): the build log reports the panel being written under
# data/processed/gold_panel as Parquet -- confirm against GoldBuilder.
builder = GoldBuilder(db_path=db_path, silver_path="data/processed/silver")
builder.run()

[2026-02-12 00:05:22] [INFO] [GoldBuilder] BUILDING GOLD PANEL AT data/processed/gold_panel...
[2026-02-12 00:05:22] [INFO] [GoldBuilder] Loading Returns for Lookup...


Processing Years: 100%|██████████| 65/65 [02:19<00:00,  2.15s/it]

[2026-02-12 00:07:53] [INFO] [GoldBuilder] GOLD PANEL PARQUET BUILD COMPLETE





# Metadata

In [6]:
# Create the database inspector and fetch the list of table names.
# Both `inspector` and `tables` are reused by the cells that follow, and the
# inspector is only closed after the full report has been generated.
inspector = DatabaseInspector(db_path)
tables = inspector.get_tables()

[2026-02-12 00:07:53] [INFO] [DB_Inspector] CONNECTED TO DB at data/processed/lakehouse.sqlite
[2026-02-12 00:07:53] [INFO] [DB_Inspector] FOUND TABLES: ['bronze_characteristics', 'bronze_crsp']


In [7]:
# Print summary statistics for the tables of interest. A missing table is
# reported explicitly instead of being skipped silently, so an absent section
# in the output cannot be mistaken for a successful inspection.
if 'bronze_characteristics' in tables:
    print("--- Bronze Characteristics ---")
    bronze_stats = inspector.get_table_stats('bronze_characteristics')
    print(bronze_stats)
else:
    print("Skipped: table 'bronze_characteristics' not found in the database.")

# Inspect the final panel
# NOTE(review): the GoldBuilder log reports a Parquet build, so 'gold_panel'
# may never exist as a SQLite table and this branch may always be skipped --
# confirm whether the panel should be inspected from the Parquet output instead.
if 'gold_panel' in tables:
    print("\n--- Gold Panel ---")
    gold_stats = inspector.get_table_stats('gold_panel')
    print(gold_stats)
else:
    print("\nSkipped: table 'gold_panel' not found in the database.")

--- Bronze Characteristics ---
[2026-02-12 00:07:59] [INFO] [DB_Inspector] ANALYZED bronze_characteristics: 4117300 rows.
{'row_count': 4117300, 'date_col': 'date', 'min_date': 19570131, 'max_date': 20211231}


In [8]:
# Generate the full metadata report, then release the inspector.
# try/finally ensures close() still runs if report generation raises, so a
# failed report does not leave the inspector's connection open ahead of the
# cleanup step that follows.
try:
    inspector.generate_full_report()
finally:
    inspector.close()

[2026-02-12 00:07:59] [INFO] [DB_Inspector] STARTING FULL METADATA GENERATION...
[2026-02-12 00:07:59] [INFO] [DB_Inspector] FOUND TABLES: ['bronze_characteristics', 'bronze_crsp']
[2026-02-12 00:08:04] [INFO] [DB_Inspector] ANALYZED bronze_characteristics: 4117300 rows.
[2026-02-12 00:08:05] [INFO] [DB_Inspector] ANALYZED bronze_crsp: 4358835 rows.
[2026-02-12 00:08:05] [INFO] [DB_Inspector] SAVED JSON REPORT: reports/metadata\db_metadata_20260212_000805.json
[2026-02-12 00:08:05] [INFO] [DB_Inspector] SAVED MARKDOWN REPORT: reports/metadata\db_metadata_20260212_000805.md
[2026-02-12 00:08:05] [INFO] [DB_Inspector] CONNECTION CLOSED


# Clean up

In [9]:
# Final cleanup of the SQLite file.
# NOTE(review): this is destructive. With drop_bronze_char=True the recorded
# log shows 'bronze_characteristics' being dropped before VACUUM, so the
# inspection cells above will not find that table again until the bronze stage
# is re-run -- confirm this is intended before running the cell.
vacuum_database(db_path, drop_bronze_char=True)

[2026-02-12 00:08:05] [INFO] [DB_Cleanup] Cleaning database: data/processed/lakehouse.sqlite
[2026-02-12 00:08:05] [INFO] [DB_Cleanup] Dropped table: silver_characteristics
[2026-02-12 00:08:05] [INFO] [DB_Cleanup] Dropped table: gold_panel
[2026-02-12 00:08:05] [INFO] [DB_Cleanup] Dropped table: silver_temp
[2026-02-12 00:08:07] [INFO] [DB_Cleanup] Dropped table: bronze_characteristics
[2026-02-12 00:08:07] [INFO] [DB_Cleanup] Running VACUUM (This re-writes the DB file)...
[2026-02-12 00:08:13] [INFO] [DB_Cleanup] Database optimized. Size should be significantly reduced.
