In [2]:
pip install polars

Collecting polarsNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for polars from https://files.pythonhosted.org/packages/c3/47/61e7a47f77e321aa1cbf4141cc60df9d6e63b9f469c5525226535552a04c/polars-1.38.0-py3-none-any.whl.metadata
  Downloading polars-1.38.0-py3-none-any.whl.metadata (10 kB)
Collecting polars-runtime-32==1.38.0 (from polars)
  Obtaining dependency information for polars-runtime-32==1.38.0 from https://files.pythonhosted.org/packages/68/db/9bb8007a4bea76b476537740ed18c8bccd809faa390ca1443134e98f8b60/polars_runtime_32-1.38.0-cp310-abi3-win_amd64.whl.metadata
  Downloading polars_runtime_32-1.38.0-cp310-abi3-win_amd64.whl.metadata (1.5 kB)
Downloading polars-1.38.0-py3-none-any.whl (810 kB)
   ---------------------------------------- 0.0/810.1 kB ? eta -:--:--
   ------------------------ -------------- 501.8/810.1 kB 10.5 MB/s eta 0:00:01
   --------------------------------------  809.0/810.1 kB 10.2 MB/s eta 0:00:01
   -

In [6]:
import os
import glob
import polars as pl

# ==========================
# CONFIG
# ==========================
# Source dataset (one "space=<name>" folder per space) and the root of the rewritten copy.
SRC = r"D:/snapshot_votes_441/spaces"
DST = r"D:/snapshot_votes_441/spaces_fixed_schema"
os.makedirs(DST, exist_ok=True)

# Column names commonly found in the data (as produced by the upstream pipeline),
# grouped by the dtype each column is cast to.
COLS_FLOAT = ["Voting Power", "VP Ratio (%)"]
COLS_BOOL  = ["Aligned With Majority", "Is Whale"]
COLS_STR   = [
    "Space", "Proposal ID", "Proposal Title", "Proposal Body",
    "Created Time", "Voter", "Choice", "Vote Label", "Vote Timestamp"
]
COLS_INT_OPTIONAL = ["FollowersCount"]  # cast to int when present, ignored otherwise

# ==========================
# FIX
# ==========================
files = glob.glob(os.path.join(SRC, "space=*/", "*.parquet"))
print(f"[INFO] found parquet parts: {len(files)}")
if len(files) == 0:
    raise FileNotFoundError(f"No parquet parts under {SRC}")

for i, f in enumerate(files, 1):
    # ✅ Read one file at a time so no cross-file schema merge is triggered.
    df = pl.read_parquet(f)

    # Cast group by group (floats, bools, strings, optional ints), touching only the
    # columns this part actually contains. strict=False keeps the cast non-strict.
    for cols, dtype in (
        (COLS_FLOAT, pl.Float64),
        (COLS_BOOL, pl.Boolean),
        (COLS_STR, pl.Utf8),
        (COLS_INT_OPTIONAL, pl.Int64),
    ):
        present = [c for c in cols if c in df.columns]
        df = df.with_columns([pl.col(c).cast(dtype, strict=False) for c in present])

    # Mirror the source layout: <DST>/<space folder>/<part file name>.
    src_dir, part_name = os.path.split(f)
    space_folder = os.path.basename(src_dir)  # e.g., space=aavedao.eth
    out_dir = os.path.join(DST, space_folder)
    os.makedirs(out_dir, exist_ok=True)

    out_path = os.path.join(out_dir, part_name)
    df.write_parquet(out_path)

    # Progress line every 300 files.
    if not i % 300:
        print(f"[INFO] fixed {i}/{len(files)}")

print(f"✅ DONE. Fixed schema dataset at: {DST}")




[INFO] found parquet parts: 5912
[INFO] fixed 300/5912
[INFO] fixed 600/5912
[INFO] fixed 900/5912
[INFO] fixed 1200/5912
[INFO] fixed 1500/5912
[INFO] fixed 1800/5912
[INFO] fixed 2100/5912
[INFO] fixed 2400/5912
[INFO] fixed 2700/5912
[INFO] fixed 3000/5912
[INFO] fixed 3300/5912
[INFO] fixed 3600/5912
[INFO] fixed 3900/5912
[INFO] fixed 4200/5912
[INFO] fixed 4500/5912
[INFO] fixed 4800/5912
[INFO] fixed 5100/5912
[INFO] fixed 5400/5912
[INFO] fixed 5700/5912
✅ DONE. Fixed schema dataset at: D:/snapshot_votes_441/spaces_fixed_schema


In [8]:
import os
import glob
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

# ==========================
# CONFIG
# ==========================
# Input dataset and the output tree (figures and tables live below OUT_DIR).
DATA_DIR = r"D:/snapshot_votes_441/spaces_fixed_schema"
OUT_DIR  = r"D:/snapshot_votes_441/descriptive_final"
FIG_DIR = os.path.join(OUT_DIR, "figures")
TAB_DIR = os.path.join(OUT_DIR, "tables")

# Create the output folders up front; already existing folders are left untouched.
for _out_path in (OUT_DIR, FIG_DIR, TAB_DIR):
    os.makedirs(_out_path, exist_ok=True)
del _out_path

# Column names used throughout the analysis.
COL_SPACE     = "Space"
COL_PROPOSAL  = "Proposal ID"
COL_VOTER     = "Voter"
COL_VP        = "Voting Power"
COL_VP_RATIO  = "VP Ratio (%)"
COL_ALIGN     = "Aligned With Majority"
COL_CREATED   = "Created Time"
COL_VOTE_TS   = "Vote Timestamp"
COL_FOLLOWERS = "FollowersCount"  # optional

# Voters need at least this many votes to enter the alignment-rate histogram.
MIN_VOTES_PER_VOTER_FOR_BEHAVIOUR = 5

# ==========================
# Helpers
# ==========================
def save_table(df_pd: pd.DataFrame, name: str):
    """Write ``df_pd`` to ``<TAB_DIR>/<name>.csv`` without the index and print the path.

    Args:
        df_pd: pandas frame to export.
        name: file name without the ``.csv`` extension.
    """
    destination = os.path.join(TAB_DIR, "{}.csv".format(name))
    df_pd.to_csv(destination, index=False)
    print(f"[OK] table -> {destination}")

def save_fig(name: str):
    """Save the current pyplot figure as ``<FIG_DIR>/<name>.png`` and close it.

    A tight layout is applied before saving at 220 dpi; the written path is printed.

    Args:
        name: file name without the ``.png`` extension.
    """
    destination = os.path.join(FIG_DIR, "{}.png".format(name))
    plt.tight_layout()
    plt.savefig(destination, dpi=220)
    plt.close()
    print(f"[OK] figure -> {destination}")

def gini(x: np.ndarray) -> float:
    """Gini coefficient of the finite, non-negative entries of ``x``.

    Args:
        x: array-like of numbers; NaN, infinite and negative entries are discarded.

    Returns:
        NaN when no usable entry remains, 0.0 when the usable entries sum to zero,
        otherwise ``(n + 1 - 2 * sum(cumsum) / total) / n`` on the ascending values.
    """
    values = np.asarray(x, dtype=float)
    values = values[np.isfinite(values)]
    values = values[values >= 0]

    n = values.size
    if n == 0:
        return np.nan
    if values.sum() == 0:
        return 0.0

    # Running totals of the ascending values; the last entry is the grand total.
    running = np.cumsum(np.sort(values))
    return float((n + 1 - 2 * running.sum() / running[-1]) / n)

def top_share(x: np.ndarray, top_frac: float) -> float:
    """Fraction of the total held by the largest ``top_frac`` share of entries.

    Args:
        x: array-like of numbers; NaN, infinite and negative entries are discarded.
        top_frac: fraction of entries to count as "top"; the count is
            ``ceil(top_frac * n)`` and never less than one.

    Returns:
        NaN when no usable entry remains, 0.0 when the total is not positive,
        otherwise the sum of the top entries divided by the total.
    """
    values = np.asarray(x, dtype=float)
    values = values[np.isfinite(values)]
    values = values[values >= 0]

    count = values.size
    if count == 0:
        return np.nan
    total = values.sum()
    if total <= 0:
        return 0.0

    largest_first = np.sort(values)[::-1]
    k = max(1, int(np.ceil(top_frac * count)))
    return float(largest_first[:k].sum() / total)

def plot_ccdf(values: np.ndarray, title: str, xlabel: str, outname: str):
    """Plot the empirical CCDF of the positive, finite entries on log-log axes.

    The curve is P(X >= x): the smallest observation sits at 1 and the largest at
    1/n. Using P(X > x) instead would put the largest observation at exactly 0,
    which has no position on a logarithmic y-axis, so the end of the tail would
    be missing or distorted.

    Args:
        values: array-like of numbers; NaN, infinite and non-positive entries are
            discarded (they cannot be placed on a log x-axis).
        title: figure title.
        xlabel: x-axis label; " (log)" is appended.
        outname: file name (without extension) passed to ``save_fig``.

    Returns:
        None. Nothing is drawn or saved when no usable entry remains.
    """
    v = np.asarray(values, dtype=float)
    v = v[np.isfinite(v)]
    v = v[v > 0]
    if len(v) == 0:
        return
    v = np.sort(v)
    # Share of observations >= each sorted value: n/n, (n-1)/n, ..., 1/n (never 0).
    y = 1.0 - np.arange(len(v)) / len(v)
    plt.figure(figsize=(7,4))
    plt.plot(v, y)
    plt.xscale("log"); plt.yscale("log")
    plt.xlabel(xlabel + " (log)")
    plt.ylabel("CCDF (log)")
    plt.title(title)
    save_fig(outname)

# ==========================
# Load (lazy)
# ==========================
# Every part file below the "space=*" folders of DATA_DIR.
parquet_files = glob.glob(os.path.join(DATA_DIR, "space=*/", "*.parquet"))
print(f"[INFO] parquet parts: {len(parquet_files)}")
if len(parquet_files) == 0:
    raise FileNotFoundError(f"No parquet files found under {DATA_DIR}")

# Lazy scan with the safety filter applied: keep only rows where space, proposal
# and voter are all non-null (the predicates are combined with AND).
df = pl.scan_parquet(parquet_files).filter(
    *(pl.col(key).is_not_null() for key in (COL_SPACE, COL_PROPOSAL, COL_VOTER))
)

# ==========================
# 1) Dataset Overview
# ==========================
# One-row summary of the whole dataset: row count, distinct spaces / proposals /
# voters, and the sum of voting power.
overview = (
    df.select(
        n_votes=pl.len(),
        n_spaces=pl.col(COL_SPACE).n_unique(),
        n_proposals=pl.col(COL_PROPOSAL).n_unique(),
        n_voters=pl.col(COL_VOTER).n_unique(),
        total_vp=pl.col(COL_VP).sum(),
    )
    .collect()
    .to_pandas()
)

save_table(overview, "table_1_dataset_overview")

# ==========================
# 2) Space-level stats
#    ✅ IMPORTANT: use expr.mean(), not pl.mean(expr)
# ==========================
# Aggregations computed per space: vote count, distinct proposals and voters,
# summed voting power, and the mean of the majority-alignment flag cast to Int8.
agg_space = [
    pl.len().alias("n_votes"),
    pl.col(COL_PROPOSAL).n_unique().alias("n_proposals"),
    pl.col(COL_VOTER).n_unique().alias("n_voters"),
    pl.col(COL_VP).sum().alias("total_vp"),
    pl.col(COL_ALIGN).cast(pl.Int8).mean().alias("alignment_rate"),
]

# followers optional
# `df` is a LazyFrame: reading `df.columns` emits a PerformanceWarning because the
# schema has to be resolved implicitly. Resolve it explicitly instead.
if COL_FOLLOWERS in df.collect_schema().names():
    agg_space.append(pl.first(COL_FOLLOWERS).alias("followersCount"))

# One row per space, largest vote volume first.
space_stats = (
    df.group_by(COL_SPACE)
      .agg(agg_space)
      .sort("n_votes", descending=True)
      .collect()
      .to_pandas()
)
save_table(space_stats, "space_level_stats")

# Histogram of votes per space with a logarithmic frequency axis.
plt.figure(figsize=(7,4))
plt.hist(space_stats["n_votes"], bins=60, log=True)
plt.xlabel("Votes per space")
plt.ylabel("Frequency (log)")
plt.title("Distribution: votes per space (log scale)")
save_fig("fig_votes_per_space_log")

# Bar chart of the 20 spaces with the most votes (space_stats is already sorted).
top20 = space_stats.head(20)
plt.figure(figsize=(10,5))
plt.bar(top20[COL_SPACE], top20["n_votes"])
plt.xticks(rotation=75, ha="right")
plt.ylabel("Votes")
plt.title("Top 20 spaces by vote volume")
save_fig("fig_top20_spaces_by_votes")

# ==========================
# 3) Proposal-level stats
# ==========================
# One row per proposal: first space seen, vote count, distinct voters, summed
# voting power and the mean of the majority-alignment flag.
proposal_stats = (
    df.group_by(COL_PROPOSAL)
    .agg(
        space=pl.first(COL_SPACE),
        n_votes=pl.len(),
        n_voters=pl.col(COL_VOTER).n_unique(),
        total_vp=pl.col(COL_VP).sum(),
        alignment_rate=pl.col(COL_ALIGN).cast(pl.Int8).mean(),
    )
    .collect()
    .to_pandas()
)

# Export the describe() summary with the statistic name as a regular "stat" column.
save_table(
    proposal_stats.describe(include="all")
    .reset_index()
    .rename(columns={"index": "stat"}),
    "proposal_stats_describe",
)

# Histogram of votes per proposal with a logarithmic frequency axis.
plt.figure(figsize=(7, 4))
plt.hist(proposal_stats["n_votes"], bins=80, log=True)
plt.xlabel("Votes per proposal")
plt.ylabel("Frequency (log)")
plt.title("Distribution: votes per proposal (log scale)")
save_fig("fig_votes_per_proposal_log")

# Same quantity as a CCDF on log-log axes.
plot_ccdf(
    proposal_stats["n_votes"].values,
    "Heavy-tail: votes per proposal (CCDF)",
    "Votes per proposal",
    "fig_votes_per_proposal_ccdf",
)

# ==========================
# 4) Voter-level stats
# ==========================
# One row per voter: vote count, distinct proposals and spaces, summed voting
# power, mean majority-alignment flag, and the mean of VP Ratio (%) / 100.
voter_stats = (
    df.group_by(COL_VOTER)
    .agg(
        n_votes=pl.len(),
        n_proposals=pl.col(COL_PROPOSAL).n_unique(),
        n_spaces=pl.col(COL_SPACE).n_unique(),
        total_vp=pl.col(COL_VP).sum(),
        alignment_rate=pl.col(COL_ALIGN).cast(pl.Int8).mean(),
        mean_vp_share=(pl.col(COL_VP_RATIO) / 100.0).mean(),
    )
    .collect()
    .to_pandas()
)

# Export the describe() summary with the statistic name as a regular "stat" column.
save_table(
    voter_stats.describe(include="all")
    .reset_index()
    .rename(columns={"index": "stat"}),
    "voter_stats_describe",
)

# Histogram of votes per voter with a logarithmic frequency axis.
plt.figure(figsize=(7, 4))
plt.hist(voter_stats["n_votes"], bins=80, log=True)
plt.xlabel("Votes per voter")
plt.ylabel("Frequency (log)")
plt.title("Distribution: votes per voter (log scale)")
save_fig("fig_votes_per_voter_log")

# Same quantity as a CCDF on log-log axes.
plot_ccdf(
    voter_stats["n_votes"].values,
    "Heavy-tail: votes per voter (CCDF)",
    "Votes per voter",
    "fig_votes_per_voter_ccdf",
)

# Alignment-rate histogram, restricted to voters with enough votes.
beh = voter_stats.loc[
    voter_stats["n_votes"] >= MIN_VOTES_PER_VOTER_FOR_BEHAVIOUR
].copy()
plt.figure(figsize=(7, 4))
plt.hist(beh["alignment_rate"].dropna(), bins=50)
plt.xlabel("Alignment rate with majority")
plt.ylabel("Number of voters")
plt.title(f"Voter alignment distribution (n_votes ≥ {MIN_VOTES_PER_VOTER_FOR_BEHAVIOUR})")
save_fig("fig_voter_alignment_distribution")

# ==========================
# 5) Voting Power concentration
# ==========================
# Per-voter total voting power, keeping only finite, non-negative values.
vp = voter_stats["total_vp"].to_numpy(dtype=float)
vp = vp[np.isfinite(vp)]
vp = vp[vp >= 0]

# One-row table: Gini coefficient, top 1% / 5% / 10% shares and the voter count.
conc = pd.DataFrame(
    [
        dict(
            gini_total_vp=gini(vp),
            top_1pct_share=top_share(vp, 0.01),
            top_5pct_share=top_share(vp, 0.05),
            top_10pct_share=top_share(vp, 0.10),
            n_voters=int(len(vp)),
        )
    ]
)
save_table(conc, "table_vp_concentration")

# Lorenz curve; skipped when there are no voters or no voting power at all.
vp_sorted = np.sort(vp)
if len(vp_sorted) > 0 and vp_sorted.sum() > 0:
    cum = np.cumsum(vp_sorted) / vp_sorted.sum()
    pop = np.arange(1, len(vp_sorted) + 1) / len(vp_sorted)

    plt.figure(figsize=(6, 6))
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1])
    # Prepend the origin so the curve starts at (0, 0).
    plt.plot(
        np.concatenate([[0], pop]),
        np.concatenate([[0], cum]),
    )
    plt.xlabel("Cumulative share of voters")
    plt.ylabel("Cumulative share of voting power")
    plt.title("Lorenz curve of voting power")
    save_fig("fig_lorenz_voting_power")

# ==========================
# 6) Time behaviour: vote delay
# ==========================
# Parse both timestamp strings (non-strict), take vote time minus creation time
# in seconds, and keep rows whose delay is present and not negative.
df_time = (
    df.select(COL_CREATED, COL_VOTE_TS)
    .with_columns(
        created_dt=pl.col(COL_CREATED).str.strptime(pl.Datetime, strict=False),
        vote_dt=pl.col(COL_VOTE_TS).str.strptime(pl.Datetime, strict=False),
    )
    .with_columns(
        delay_seconds=(pl.col("vote_dt") - pl.col("created_dt")).dt.total_seconds(),
    )
    .filter(
        pl.col("delay_seconds").is_not_null(),
        pl.col("delay_seconds") >= 0,
    )
)

delay_pd = df_time.collect().to_pandas()
if len(delay_pd) > 0:
    # Summary statistics including the upper percentiles, as a two-column table.
    delay_summary = (
        delay_pd["delay_seconds"]
        .describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])
        .to_frame()
        .reset_index()
    )
    delay_summary.columns = ["stat", "value"]
    save_table(delay_summary, "table_vote_delay_summary")

    # Delay in seconds, logarithmic frequency axis.
    plt.figure(figsize=(7, 4))
    plt.hist(delay_pd["delay_seconds"], bins=120, log=True)
    plt.xlabel("Seconds since proposal creation")
    plt.ylabel("Frequency (log)")
    plt.title("Distribution: vote delay (seconds, log scale)")
    save_fig("fig_vote_delay_seconds_log")

    # Same data expressed in hours.
    plt.figure(figsize=(7, 4))
    plt.hist(delay_pd["delay_seconds"] / 3600.0, bins=120, log=True)
    plt.xlabel("Hours since proposal creation")
    plt.ylabel("Frequency (log)")
    plt.title("Distribution: vote delay (hours, log scale)")
    save_fig("fig_vote_delay_hours_log")

print("\n✅ Descriptive analysis finished.")
print(f"Tables -> {TAB_DIR}")
print(f"Figures -> {FIG_DIR}")



[INFO] parquet parts: 5912
[OK] table -> D:/snapshot_votes_441/descriptive_final\tables\table_1_dataset_overview.csv


  if COL_FOLLOWERS in df.columns:


[OK] table -> D:/snapshot_votes_441/descriptive_final\tables\space_level_stats.csv
[OK] figure -> D:/snapshot_votes_441/descriptive_final\figures\fig_votes_per_space_log.png
[OK] figure -> D:/snapshot_votes_441/descriptive_final\figures\fig_top20_spaces_by_votes.png
[OK] table -> D:/snapshot_votes_441/descriptive_final\tables\proposal_stats_describe.csv
[OK] figure -> D:/snapshot_votes_441/descriptive_final\figures\fig_votes_per_proposal_log.png
[OK] figure -> D:/snapshot_votes_441/descriptive_final\figures\fig_votes_per_proposal_ccdf.png
[OK] table -> D:/snapshot_votes_441/descriptive_final\tables\voter_stats_describe.csv
[OK] figure -> D:/snapshot_votes_441/descriptive_final\figures\fig_votes_per_voter_log.png
[OK] figure -> D:/snapshot_votes_441/descriptive_final\figures\fig_votes_per_voter_ccdf.png
[OK] figure -> D:/snapshot_votes_441/descriptive_final\figures\fig_voter_alignment_distribution.png
[OK] table -> D:/snapshot_votes_441/descriptive_final\tables\table_vp_concentration.cs