In [9]:
# Imports
import os, json, glob, subprocess, datetime as dt
from pathlib import Path

import requests
import polars as pl
import pyarrow.dataset as ds
import matplotlib.pyplot as plt

In [10]:
# Paths on the EXTERNAL drive.
# The drive root can be overridden with the BTC_DATA_ROOT environment variable
# (e.g. to point at a different mount) without editing the notebook; when it is
# unset, the original location is used.
DATA_ROOT   = Path(os.environ.get("BTC_DATA_ROOT", "/media/vatereal/Main"))
PARQUET_DIR = DATA_ROOT / "parquet"
OUTPUT_DIR  = DATA_ROOT / "outputs"
# Create the output directory (and any missing parents); no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Polars display: table row limit and string cell length limit
pl.Config.set_tbl_rows(20)
pl.Config.set_fmt_str_lengths(80)

polars.config.Config

In [11]:
# RPC (HTTP first, fallback to docker exec bitcoin-cli)
# Endpoint and credentials are taken from the environment when set, so real
# secrets do not have to live in the notebook; the defaults are the original
# hard-coded values, so behavior is unchanged when the variables are unset.
RPC_URL  = os.environ.get("BTC_RPC_URL", "http://127.0.0.1:8332")
RPC_USER = os.environ.get("BTC_RPC_USER", "research")
RPC_PASS = os.environ.get("BTC_RPC_PASS", "researchpass")

def _cid() -> str:
    """Return the id printed by ``docker compose ps -q bitcoind``, or "" on failure.

    The command runs through ``bash -lc`` against the compose file at
    ``~/btc-node/docker-compose.yml``. Any exception (including a non-zero
    exit status) is swallowed and reported as an empty string, so callers can
    treat "" as "no container available".
    """
    compose_ps = "docker compose -f ~/btc-node/docker-compose.yml ps -q bitcoind"
    try:
        raw_id = subprocess.check_output(["bash", "-lc", compose_ps], text=True)
    except Exception:
        # Best effort by design: the caller decides what to do without a container.
        return ""
    return raw_id.strip()

def rpc(method: str, params=None, timeout: int = 30):
    """Call a node RPC method: HTTP JSON-RPC first, ``bitcoin-cli`` in docker as fallback.

    Args:
        method: RPC method name, e.g. ``"getblockchaininfo"``.
        params: Positional RPC parameters; ``None`` (or empty) means no parameters.
        timeout: Timeout in seconds for the HTTP request (not applied to the
            docker fallback).

    Returns:
        The ``result`` field of the JSON-RPC reply. On the fallback path, the
        JSON-decoded output of ``bitcoin-cli``, or the stripped raw text when
        that output is not valid JSON.

    Raises:
        ConnectionError: the HTTP attempt failed and ``_cid()`` found no container.
        subprocess.CalledProcessError: the ``bitcoin-cli`` fallback exited non-zero.
    """
    args = params or []
    try:
        r = requests.post(
            RPC_URL,
            json={"jsonrpc":"1.0","id":"nb","method":method,"params":args},
            auth=(RPC_USER, RPC_PASS),
            timeout=timeout
        )
        r.raise_for_status()
        j = r.json()
        if j.get("error"):
            raise RuntimeError(j["error"])
        return j["result"]
    except Exception as e_http:
        # Every failure on the HTTP path (transport, HTTP status, RPC-level
        # error raised above) ends up here and triggers the docker fallback.
        cid = _cid()
        if not cid:
            raise ConnectionError(f"RPC HTTP failed and container not found: {e_http}") from e_http
        # NOTE(review): the password is passed on the command line, so it shows
        # up in process listings and in CalledProcessError messages.
        cmd = ["docker","exec","-i",cid,"bitcoin-cli",
               f"-rpcuser={RPC_USER}", f"-rpcpassword={RPC_PASS}", method]
        for p in args:
            # Containers, booleans and None must be JSON-encoded: str(True) is
            # "True" and str(None) is "None", neither of which is valid JSON
            # ("true" / "null"). Other scalars keep their plain str() form.
            if p is None or isinstance(p, (dict, list, tuple, bool)):
                cmd.append(json.dumps(p))
            else:
                cmd.append(str(p))
        out = subprocess.check_output(cmd, text=True).strip()
        try:
            return json.loads(out)
        except Exception:
            # Some commands print plain text rather than JSON; return it as-is.
            return out

# Quick check (won't crash the notebook if RPC is down): any exception raised
# by rpc() or while reading the reply is caught and printed instead of propagating.
try:
    info = rpc("getblockchaininfo")
    print("chain:", info["chain"], "| node height:", info["blocks"])
except Exception as e:
    print("RPC check failed:", e)

RPC check failed: Command '['docker', 'exec', '-i', '866823c2d85e65e2ccea1b43d76e8d0a2e561a680dfaaf8f3c6a09c66f4c022f', 'bitcoin-cli', '-rpcuser=research', '-rpcpassword=researchpass', 'getblockchaininfo']' returned non-zero exit status 28.


error code: -28
error message:
Loading block index…


In [12]:
# Base datasets from your ETL: lazy scans over the day=* partition folders
blocks_lf = pl.scan_parquet(str(PARQUET_DIR / "blocks" / "day=*" / "blocks-*.parquet"))
txs_lf    = pl.scan_parquet(str(PARQUET_DIR / "txs" / "day=*" / "txs-*.parquet"))
# NOTE(review): the io schema printed further down shows Null-typed columns
# (prev_txid / prev_vout / address) followed by a SchemaError — presumably the
# dtypes are inferred from a file where those columns are all null. Confirm and
# consider fixing the ETL output or supplying an explicit schema.
io_lf     = pl.scan_parquet(str(PARQUET_DIR / "io" / "day=*" / "io-*.parquet"))

# Prefer derived path for io_enriched if you created it; else fall back to legacy layout if present
ioe_candidates = [
    str(PARQUET_DIR / "derived" / "io_enriched" / "day=*" / "ioen.parquet"),
    str(PARQUET_DIR / "io_enriched" / "day=*" / "ioen-*.parquet"),
]
# First pattern that matches at least one file on disk; None when neither does
ioe_glob = next(filter(glob.glob, ioe_candidates), None)
ioe_lf   = None if ioe_glob is None else pl.scan_parquet(ioe_glob)

print("Scans ready.", "(no io_enriched yet)" if ioe_lf is None else " (io_enriched found)")

Scans ready. (no io_enriched yet)


In [13]:
# ------------------------------------------------------------------------------
# Helpers: schema & quick counts
# ------------------------------------------------------------------------------
def show_schema(lf: pl.LazyFrame, name: str):
    """Print a ``## <name> schema`` header followed by one ``column: dtype`` line per column.

    The schema is resolved with ``collect_schema()`` before anything is printed.
    """
    resolved = lf.collect_schema()
    print(f"\n## {name} schema")
    for column in resolved.keys():
        print(f"  {column}: {resolved[column]}")

def quick_counts(lf: pl.LazyFrame, name: str, max_cols: int = 6):
    """Display the total row count and the null count of the first ``max_cols`` columns.

    Produces a one-row frame with a ``rows`` column plus one ``<col>__nulls``
    column per inspected column, collected with the streaming engine, and
    shows it under a short printed header.
    """
    inspected = list(lf.collect_schema().keys())[:max_cols]

    exprs = [pl.len().alias("rows")]
    exprs.extend(
        pl.col(column).is_null().sum().alias(f"{column}__nulls")
        for column in inspected
    )
    summary = lf.select(exprs).collect(engine="streaming")

    print(f"\n## {name} — rows & nulls (first {len(inspected)} cols)")
    display(summary)

def has_columns(lf: pl.LazyFrame, *cols: str) -> bool:
    """Return True when every name in ``cols`` is present in the schema of ``lf``.

    With no names given the result is True.
    """
    known = lf.collect_schema()
    for wanted in cols:
        if wanted not in known:
            return False
    return True

In [14]:
# Inspect the blocks scan: print column dtypes, then show row and null counts.
show_schema(blocks_lf, "blocks")
quick_counts(blocks_lf, "blocks")


## blocks schema
  height: Int64
  block_hash: String
  time: Datetime(time_unit='ns', time_zone=None)
  tx_count: Int64
  size: Int64
  weight: Int64

## blocks — rows & nulls (first 6 cols)


rows,height__nulls,block_hash__nulls,time__nulls,tx_count__nulls,size__nulls,weight__nulls
u32,u32,u32,u32,u32,u32,u32
214849,0,0,0,0,0,0


In [15]:
# Inspect the txs scan: print column dtypes, then show row and null counts
# (quick_counts only covers the first 6 columns by default).
show_schema(txs_lf, "txs")
quick_counts(txs_lf, "txs")


## txs schema
  height: Int64
  block_hash: String
  time: Datetime(time_unit='ns', time_zone=None)
  txid: String
  hash: String
  size: Int64
  vsize: Int64
  weight: Int64
  vin_count: Int64
  vout_count: Int64

## txs — rows & nulls (first 6 cols)


rows,height__nulls,block_hash__nulls,time__nulls,txid__nulls,hash__nulls,size__nulls
u32,u32,u32,u32,u32,u32,u32
9676089,0,0,0,0,0,0


In [16]:
# Inspect the io scan: print column dtypes, then show row and null counts.
# NOTE(review): the recorded run of this cell ended in a SchemaError on
# prev_txid (String vs Null) inside quick_counts — presumably the files under
# io/ do not all share one schema; confirm before relying on io_lf.
show_schema(io_lf, "io")
quick_counts(io_lf, "io")


## io schema
  dir: String
  height: Int64
  time: Datetime(time_unit='ns', time_zone=None)
  txid: String
  n: Int64
  prev_txid: Null
  prev_vout: Null
  address: Null
  value: Float64


SchemaError: data type mismatch for column prev_txid: incoming: String != target: Null

In [None]:
# Same inspection for io_enriched; skipped when no io_enriched scan was set up (ioe_lf is None).
if ioe_lf is not None:
    show_schema(ioe_lf, "io_enriched")
    quick_counts(ioe_lf, "io_enriched")

In [None]:
# Blocks per day: one row per calendar day of `time`, counting blocks
blocks_daily = (
    blocks_lf
    .with_columns(day=pl.col("time").dt.date())
    .group_by("day")
    .agg(blocks=pl.len())
    .sort("day")
    .collect(engine="streaming")
)

# TXs per day: same aggregation over the transactions scan
txs_daily = (
    txs_lf
    .with_columns(day=pl.col("time").dt.date())
    .group_by("day")
    .agg(txs=pl.len())
    .sort("day")
    .collect(engine="streaming")
)

# Show the last 10 days of each table
display(blocks_daily.tail(10))
display(txs_daily.tail(10))

# Matplotlib plots (convert to numpy/pandas for plotting)
bd_day = blocks_daily.get_column("day").to_pandas()
bd_cnt = blocks_daily.get_column("blocks").to_pandas()

td_day = txs_daily.get_column("day").to_pandas()
td_cnt = txs_daily.get_column("txs").to_pandas()

plt.figure(figsize=(10, 4))
plt.plot(bd_day, bd_cnt)
plt.title("Blocks per day")
plt.xlabel("Day")
plt.ylabel("Blocks")
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 4))
plt.plot(td_day, td_cnt)
plt.title("Transactions per day")
plt.xlabel("Day")
plt.ylabel("Transactions")
plt.tight_layout()
plt.show()


In [None]:
# For huge datasets, consider restricting by day range, e.g. last N days
# io_recent = io_lf.filter(pl.col("time") >= pl.lit(pl.datetime(2013,1,1)))

# The 25 addresses that appear most often on rows with dir == "out"; the result
# is displayed and written to top_output_addresses.csv in OUTPUT_DIR.
if not has_columns(io_lf, "address", "dir"):
    print("io_lf is missing 'address' or 'dir' columns (older ETL or different schema?).")
else:
    top_addrs = (
        io_lf
        .filter(pl.col("dir") == "out", pl.col("address").is_not_null())
        .group_by("address")
        .agg(pl.len().alias("len"))
        .sort("len", descending=True)
        .head(25)
        .collect(engine="streaming")
    )

    display(top_addrs)
    top_addrs.write_csv(OUTPUT_DIR / "top_output_addresses.csv")


In [None]:
# Daily sum of `value` over rows with dir == "out" (null values excluded),
# displayed, written to outputs_value_daily.csv and plotted.
if not has_columns(io_lf, "value", "dir", "time"):
    print("io_lf is missing 'value' / 'dir' / 'time' columns.")
else:
    out_value_daily = (
        io_lf
        .filter(pl.col("dir") == "out", pl.col("value").is_not_null())
        .with_columns(day=pl.col("time").dt.date())
        .group_by("day")
        .agg(total_btc=pl.col("value").sum())
        .sort("day")
        .collect(engine="streaming")
    )

    display(out_value_daily.tail(10))
    out_value_daily.write_csv(OUTPUT_DIR / "outputs_value_daily.csv")

    ov_day = out_value_daily.get_column("day").to_pandas()
    ov_btc = out_value_daily.get_column("total_btc").to_pandas()

    plt.figure(figsize=(10, 4))
    plt.plot(ov_day, ov_btc)
    plt.title("Sum of output values per day (BTC)")
    plt.xlabel("Day")
    plt.ylabel("BTC")
    plt.tight_layout()
    plt.show()


In [None]:
SAT = 100_000_000  # satoshis per BTC; multiplier used to turn `value` into integer `value_sat`

def script_type_expr(addr_col: str = "address") -> pl.Expr:
    """
    Classify script type from address prefix.
    Mainnet: 1 (P2PKH), 3 (P2SH), bc1q (v0 segwit), bc1p (taproot)
    Testnet: m/n (P2PKH), 2 (P2SH), tb1q (v0 segwit), tb1p (taproot)

    Args:
        addr_col: Name of the column holding the address; it is cast to string first.

    Returns:
        A string expression yielding "p2tr", "v0_segwit", "p2pkh", "p2sh" or
        "other", and null when the address is null or empty. Branches are
        tested in that order, so the first matching prefix wins.
    """
    a = pl.col(addr_col).cast(pl.Utf8)

    def starts_with_any(*prefixes: str) -> pl.Expr:
        # Expr.str.starts_with takes a single prefix; unlike Python's
        # str.startswith it has no tuple form, so OR one check per prefix.
        matched = a.str.starts_with(prefixes[0])
        for prefix in prefixes[1:]:
            matched = matched | a.str.starts_with(prefix)
        return matched

    return (
        pl.when(a.is_null() | (a == ""))
          .then(pl.lit(None, dtype=pl.Utf8))
        .when(starts_with_any("bc1p", "tb1p")).then(pl.lit("p2tr"))          # Taproot
        .when(starts_with_any("bc1q", "tb1q")).then(pl.lit("v0_segwit"))     # P2WPKH/P2WSH
        .when(starts_with_any("1", "m", "n")).then(pl.lit("p2pkh"))          # legacy PKH
        .when(starts_with_any("3", "2")).then(pl.lit("p2sh"))                # legacy SH
        .otherwise(pl.lit("other"))
    )

def ensure_outs_kv(io: pl.LazyFrame) -> pl.LazyFrame:
    """
    Keep only outputs (dir='out'), ensure numeric value, add value_sat and script_type.

    Args:
        io: Lazy frame with at least the columns ``dir``, ``value`` and ``address``.

    Returns:
        The filtered lazy frame with ``value`` cast to Float64, plus
        ``value_sat`` (``value * SAT`` rounded to an Int64) and
        ``script_type`` (from ``script_type_expr("address")``).
    """
    # Build value_sat from the CAST expression: expressions inside a single
    # with_columns all read the input frame, so pl.col("value") there would
    # still be the un-cast column and the numeric guarantee would not apply.
    value_btc = pl.col("value").cast(pl.Float64)
    return (
        io.filter(pl.col("dir") == "out")
          .with_columns([
              value_btc,
              # Round before the integer cast so float noise does not truncate the amount.
              (value_btc * SAT).round(0).cast(pl.Int64).alias("value_sat"),
              script_type_expr("address").alias("script_type"),
          ])
    )

def enrich_day(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Add ``day`` (date), ``year`` and ``month`` columns derived from the ``time`` column."""
    when = pl.col("time").dt
    return lf.with_columns(
        day=when.date(),
        year=when.year(),
        month=when.month(),
    )


In [None]:
# Outputs only, with value_sat and script_type added (still lazy — nothing is read yet)
io_out_lf = ensure_outs_kv(io_lf)
# Same frame with day / year / month columns derived from `time`
io_out_day_lf = enrich_day(io_out_lf)

print("Enriched outputs lazyframe ready.")


In [None]:
# Top output addresses (counts). Heavy — consider filtering by date if needed.
# Writes the same file name (top_output_addresses.csv) as the earlier io_lf-based cell.
if "address" not in io_out_lf.collect_schema():
    print("Enriched outputs missing 'address' column.")
else:
    top_addrs = (
        io_out_lf
        .filter(pl.col("address").is_not_null())
        .group_by("address")
        .agg(pl.len().alias("len"))
        .sort("len", descending=True)
        .head(25)
        .collect(engine="streaming")
    )
    display(top_addrs)
    top_addrs.write_csv(OUTPUT_DIR / "top_output_addresses.csv")


  if {"address","dir"} <= set(io_lf.columns):
  ).collect(streaming=True)


ShapeError: could not create a new DataFrame: series "len" has length 1 while series "address_nc" has length 0

In [None]:
# Overall per-day totals over the enriched outputs: summed value in BTC (float),
# summed value in satoshi (integer) and the number of output rows.
# Writes outputs_value_daily.csv, the same file name the earlier io_lf-based cell uses.
out_value_daily = (
    io_out_day_lf
    .group_by("day")
    .agg([
        pl.sum("value").alias("total_btc"),
        pl.sum("value_sat").alias("total_sat"),
        # pl.len() is the row count; it replaces the deprecated pl.count()
        # and matches the other cells in this notebook.
        pl.len().alias("n_outs"),
    ])
    .sort("day")
).collect(engine="streaming")

display(out_value_daily.tail(10))
out_value_daily.write_csv(OUTPUT_DIR / "outputs_value_daily.csv")

# Plot (convert to pandas for Matplotlib)
ov_day = out_value_daily["day"].to_pandas()
ov_btc = out_value_daily["total_btc"].to_pandas()

plt.figure(figsize=(10,4))
plt.plot(ov_day, ov_btc)
plt.title("Sum of output values per day (BTC)")
plt.xlabel("Day"); plt.ylabel("BTC"); plt.tight_layout(); plt.show()

# Optional: breakdown by script_type (daily), in satoshi
out_value_daily_by_type = (
    io_out_day_lf
    .group_by(["day","script_type"])
    .agg(pl.sum("value_sat").alias("total_sat"))
    .sort(["day","script_type"])
).collect(engine="streaming")

display(out_value_daily_by_type.tail(20))
out_value_daily_by_type.write_csv(OUTPUT_DIR / "outputs_value_daily_by_script_type.csv")


In [None]:
# NOTE: Instead of loading into the home/root folder on my Ubuntu machine, let's try loading everything onto the remote host or the external disk at /media/vatereal/ESD-USB.