# Crypto Network Analysis + LLM-Augmented Pipeline (Starter Notebook)

This notebook is a **drop-in scaffold** for your study:

1. Load yearly transaction graphs (edge-list CSV with columns `src,dst` and an optional `weight`).
2. Compute core metrics: degree distribution, top hubs, component sizes.
3. **Entity-aware** metrics via an optional address→entity mapping CSV.
4. Compare sampling policies (baseline vs. Random-Walk-with-Fly-Back, RWFB).
5. Simple **attack simulations** (remove top-k nodes/entities, re-check robustness).
6. Auto-generate a short **Results** text block you can paste into the paper.

**Files expected** (you can replace the synthetic demo below):
- `edges_YEAR.csv` with columns `src,dst` and optional `weight`.
- `entity_map.csv` with columns `address,entity` (optional).

---
If you need a custom parser (e.g., from raw block explorer exports), add it where indicated.


In [None]:
# Imports (stdlib + numpy/pandas/matplotlib only; no internet installs needed)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import random

pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 120)

## 0) Synthetic demo data (replace with your real edges/constants)
This creates a small directed graph with a few hubs and community structure so all cells run.

In [None]:
def make_synthetic_graph(n_communities=3, community_size=400, hub_prob=0.08, intra_p=0.01, inter_p=0.001, seed=7):
    """Build a synthetic directed edge list with communities and hubs.

    Nodes are named ``c{community}_n{index}``. Three families of edges are
    drawn independently, all with weight 1.0:

    * ordered pairs inside one community, each with probability ``intra_p``;
    * ordered pairs across communities, each with probability ``inter_p``;
    * for every (node, hub) pair, a node->hub edge with probability 0.03 and
      a hub->node edge with probability 0.02.

    The same ordered pair can be produced by more than one family, so the
    result may contain parallel edges.

    Note: this reseeds the *global* ``random`` and ``numpy.random`` generators.

    Args:
        n_communities: number of communities.
        community_size: nodes per community.
        hub_prob: fraction of all nodes promoted to hubs (at least 3 hubs are
            chosen, capped at the number of nodes).
        intra_p: edge probability for ordered pairs within a community.
        inter_p: edge probability for ordered pairs across communities.
        seed: seed applied to both global generators.

    Returns:
        ``(edges, nodes, hubs)`` where ``edges`` is a DataFrame with columns
        ``src, dst, weight``, ``nodes`` is the list of node names and ``hubs``
        is the set of hub names.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Community label per node, kept parallel to `nodes`, so membership tests
    # do not need to re-parse the node name.
    communities = [
        [f"c{c}_n{i}" for i in range(community_size)]
        for c in range(n_communities)
    ]
    nodes = [name for members in communities for name in members]
    labels = [c for c, members in enumerate(communities) for _ in members]

    # Cap at len(nodes): random.sample raises when asked for more than exists.
    n_hubs = min(len(nodes), max(3, int(hub_prob * len(nodes))))
    hubs = set(random.sample(nodes, n_hubs))

    edges = []
    # intra-community edges
    for members in communities:
        for src in members:
            for dst in members:
                if src != dst and np.random.rand() < intra_p:
                    edges.append((src, dst, 1.0))
    # inter-community edges
    for src, src_label in zip(nodes, labels):
        for dst, dst_label in zip(nodes, labels):
            if src_label != dst_label and np.random.rand() < inter_p:
                edges.append((src, dst, 1.0))
    # connect to hubs; iterate hubs in sorted order because set iteration
    # order of strings changes between interpreter runs (hash randomization),
    # which would otherwise make the output differ for the same seed.
    hub_order = sorted(hubs)
    for src in nodes:
        for h in hub_order:
            if src != h and np.random.rand() < 0.03:
                edges.append((src, h, 1.0))
            if src != h and np.random.rand() < 0.02:
                edges.append((h, src, 1.0))
    df = pd.DataFrame(edges, columns=["src", "dst", "weight"])
    return df, nodes, hubs

# Build the demo graph with the default parameters; later cells read
# edges_df, nodes and hubs as module-level names.
edges_df, nodes, hubs = make_synthetic_graph()
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the Jupyter/IPython runtime — confirm before running as a script.
display(edges_df.head())
print("Nodes:", len(nodes), "Edges:", len(edges_df))

## 1) Core utilities: graph build, degree stats, components, RWFB sampling
We avoid heavyweight graph libs for portability.

In [None]:
def to_adj(edges: pd.DataFrame, directed=True):
    """Convert an edge-list DataFrame into an adjacency mapping.

    Args:
        edges: DataFrame with columns ``src`` and ``dst`` and an optional
            ``weight`` column (a missing column means weight 1.0 everywhere).
        directed: when False, every edge is also stored in the reverse
            direction with the same weight.

    Returns:
        ``defaultdict(list)`` mapping each node that has at least one stored
        out-edge to a list of ``(neighbor, weight)`` tuples, in edge order.
        In directed mode, nodes that only ever appear as ``dst`` get no key.
    """
    # Read whole columns instead of DataFrame.iterrows(): iterrows builds one
    # Series per row and upcasts mixed numeric rows, so integer node ids would
    # come back as floats whenever the weight column is float.
    sources = edges['src'].tolist()
    targets = edges['dst'].tolist()
    if 'weight' in edges.columns:
        weights = edges['weight'].tolist()
    else:
        weights = [1.0] * len(sources)

    adj = defaultdict(list)
    for src, dst, weight in zip(sources, targets, weights):
        adj[src].append((dst, weight))
        if not directed:
            adj[dst].append((src, weight))
    return adj

def degree_stats(edges: pd.DataFrame):
    """Compute per-node in-, out- and total degree from an edge list.

    Degrees count edge rows, so parallel edges are counted once per row.

    Args:
        edges: DataFrame with columns ``src`` and ``dst``.

    Returns:
        DataFrame indexed by node with integer columns ``in_deg``,
        ``out_deg`` and ``deg`` (= in + out), sorted by ``deg`` descending.
        Nodes with equal ``deg`` keep sorted-label order, so the ranking is
        reproducible from run to run.
    """
    out_deg = edges.groupby('src').size()
    in_deg = edges.groupby('dst').size()
    # Index.union yields a sorted index; reindexing by a Python set would
    # inherit the set's iteration order, which varies with hash randomization.
    all_nodes = out_deg.index.union(in_deg.index)
    out_deg = out_deg.reindex(all_nodes, fill_value=0).astype(int)
    in_deg = in_deg.reindex(all_nodes, fill_value=0).astype(int)
    table = pd.DataFrame({"in_deg": in_deg, "out_deg": out_deg, "deg": out_deg + in_deg})
    # A stable sort keeps tie order deterministic (default quicksort does not).
    return table.sort_values('deg', ascending=False, kind='mergesort')

class DSU:
    """Disjoint-set union (union-find) over arbitrary hashable items.

    Items are registered lazily the first time ``find`` sees them.
    """

    def __init__(self):
        # p: item -> parent item (a root is its own parent)
        self.p = {}
        # sz: item -> size of the set rooted at that item (valid for roots)
        self.sz = {}

    def find(self, x):
        """Return the root of ``x``'s set, registering ``x`` if it is new."""
        parent = self.p
        if x not in parent:
            parent[x] = x
            self.sz[x] = 1
        node = x
        while parent[node] != node:
            # Path halving: point the node at its grandparent, then hop there.
            grandparent = parent[parent[node]]
            parent[node] = grandparent
            node = grandparent
        return node

    def union(self, a, b):
        """Merge the sets containing ``a`` and ``b`` (smaller under larger)."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a == root_b:
            return
        if self.sz[root_a] < self.sz[root_b]:
            keep, absorb = root_b, root_a
        else:
            keep, absorb = root_a, root_b
        self.p[absorb] = keep
        self.sz[keep] += self.sz[absorb]

def components_undirected(edges: pd.DataFrame):
    """Find connected components, treating every edge as undirected.

    Args:
        edges: DataFrame with columns ``src`` and ``dst``.

    Returns:
        ``(sizes, comp)`` where ``sizes`` is the list of component sizes in
        descending order and ``comp`` is a ``defaultdict(list)`` mapping each
        component's DSU root to its member nodes. Members are listed in
        first-appearance order (all ``src`` values, then ``dst`` values).
    """
    # Column-wise access avoids DataFrame.iterrows(), which is slow and
    # upcasts integer node ids to float when other columns are float.
    sources = edges['src'].tolist()
    targets = edges['dst'].tolist()

    dsu = DSU()
    for a, b in zip(sources, targets):
        dsu.union(a, b)

    comp = defaultdict(list)
    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike iterating a set of node labels.
    for node in dict.fromkeys(sources + targets):
        comp[dsu.find(node)].append(node)
    sizes = sorted((len(members) for members in comp.values()), reverse=True)
    return sizes, comp

def rwfb_sample(adj, start=None, steps=10000, fly_back_p=0.3):
    """Random Walk with Fly-Back over a directed graph using out-edges.

    At every step the current node is recorded, then the walker either jumps
    back to ``start`` (with probability ``fly_back_p``) or moves to a
    uniformly chosen out-neighbor. A node without out-neighbors also sends
    the walker back to ``start``. Edge weights are ignored.

    Args:
        adj: mapping ``node -> list of (neighbor, weight)`` tuples.
        start: node to start from and fly back to; when None, a random key
            of ``adj`` is used.
        steps: number of recorded steps.
        fly_back_p: probability of returning to ``start`` at each step.

    Returns:
        ``Counter`` mapping each visited node to its visit count; empty when
        ``adj`` is empty and no ``start`` was given.
    """
    if start is None:
        nodes = list(adj.keys())
        if not nodes:
            # Nothing to walk on; random.choice would raise on an empty list.
            return Counter()
        start = random.choice(nodes)

    visits = Counter()
    cur = start
    for _ in range(steps):
        visits[cur] += 1
        if random.random() < fly_back_p:
            cur = start
            continue
        # adj.get avoids creating empty entries when adj is a defaultdict.
        nbrs = [v for v, _ in adj.get(cur, [])]
        cur = random.choice(nbrs) if nbrs else start
    return visits

# Derive the shared structures used by later cells from the demo edge list:
# adjacency (out-edges only), per-node degree table, and undirected components.
adj = to_adj(edges_df, directed=True)
deg_df = degree_stats(edges_df)
sizes, comp = components_undirected(edges_df)
print("Top-degree nodes:")
display(deg_df.head(10))
# sizes is sorted in descending order, so the slice shows the five largest.
print("Top component sizes:", sizes[:5])

### Plots: degree distribution (log-log)

In [None]:
# Number of nodes observed at each total-degree value, ordered by degree.
# 'deg' is in_deg + out_deg as produced by degree_stats.
deg_counts = deg_df['deg'].value_counts().sort_index()
plt.figure()
plt.scatter(deg_counts.index, deg_counts.values)
# Log-log axes: a heavy-tailed distribution shows up as a roughly straight band.
plt.xscale('log'); plt.yscale('log')
plt.title('Degree distribution (undirected degree)')
plt.xlabel('Degree'); plt.ylabel('Count')
plt.show()

## 2) Entity-aware metrics (using optional address→entity map)
Provide a CSV named `entity_map.csv` with columns `address,entity`.
We'll compute an entity-level degree table and the share of total edge weight sent (out-flow) and received (in-flow) by the top-k entities.

In [None]:
def entity_metrics(edges: pd.DataFrame, entity_map: pd.DataFrame, topk=10):
    """Collapse an address-level edge list to entities and measure concentration.

    Addresses missing from ``entity_map`` are kept as singleton entities
    labelled by the address itself.

    Args:
        edges: DataFrame with columns ``src`` and ``dst`` and an optional
            ``weight`` column (treated as 1.0 per edge when absent).
        entity_map: DataFrame with columns ``address`` and ``entity``.
        topk: number of top entities used for the flow-share figures.

    Returns:
        ``(deg_table, topk_out, topk_in, ent_edges)``:

        * ``deg_table``: entity-indexed DataFrame with columns ``in``,
          ``out`` and ``deg``, counting distinct entity-to-entity edges,
          sorted by ``deg`` descending (ties in sorted-label order);
        * ``topk_out`` / ``topk_in``: fraction of total weight sent /
          received by the ``topk`` heaviest senders / receivers;
        * ``ent_edges``: collapsed edge list with columns ``src_entity``,
          ``dst_entity``, ``weight`` (summed) and ``count`` (always 1).
    """
    addr_to_entity = dict(zip(entity_map['address'], entity_map['entity']))
    edges2 = edges.copy()
    if 'weight' not in edges2.columns:
        # The weight column is documented as optional; default to unit weight
        # instead of failing in the aggregation below.
        edges2['weight'] = 1.0
    edges2['src_entity'] = edges2['src'].map(addr_to_entity).fillna(edges2['src'])
    edges2['dst_entity'] = edges2['dst'].map(addr_to_entity).fillna(edges2['dst'])

    # collapse to entity-level graph: one row per (src_entity, dst_entity)
    ent_edges = edges2.groupby(['src_entity', 'dst_entity'], as_index=False)['weight'].sum()
    ent_edges['count'] = 1

    # in/out degree at entity-level (by edge count)
    out_deg_e = ent_edges.groupby('src_entity')['count'].sum()
    in_deg_e = ent_edges.groupby('dst_entity')['count'].sum()
    # Sorted union + stable sort keep the ranking reproducible; a Python set
    # here would make tie order depend on hash randomization.
    ent_nodes = out_deg_e.index.union(in_deg_e.index)
    out_deg_e = out_deg_e.reindex(ent_nodes, fill_value=0).astype(int)
    in_deg_e = in_deg_e.reindex(ent_nodes, fill_value=0).astype(int)
    deg_table = pd.DataFrame(
        {"in": in_deg_e, "out": out_deg_e, "deg": out_deg_e + in_deg_e}
    ).sort_values('deg', ascending=False, kind='mergesort')

    # flow concentration by weight
    total_w = ent_edges['weight'].sum()
    denom = total_w if total_w > 0 else 1  # avoid 0/0 on empty or zero-weight input
    ent_out_w = ent_edges.groupby('src_entity')['weight'].sum().sort_values(ascending=False)
    ent_in_w = ent_edges.groupby('dst_entity')['weight'].sum().sort_values(ascending=False)
    topk_out = ent_out_w.head(topk).sum() / denom
    topk_in = ent_in_w.head(topk).sum() / denom
    return deg_table, float(topk_out), float(topk_in), ent_edges

# Demo: fabricate a tiny mapping that assigns each of five hub addresses its
# own synthetic exchange entity. Hubs are taken in sorted order because
# slicing list(hubs) depends on set iteration order, which changes per run.
demo_hubs = sorted(hubs)[:5]
demo_map = pd.DataFrame({
    'address': demo_hubs,
    'entity': [f'Exchange_{i}' for i in range(len(demo_hubs))],
})
deg_e, topk_out_share, topk_in_share, ent_edges = entity_metrics(edges_df, demo_map, topk=5)
print('Entity-level top-degree:')
display(deg_e.head(10))
print('Top-5 entity out-flow share (by weight):', round(topk_out_share, 4))
print('Top-5 entity in-flow share (by weight):', round(topk_in_share, 4))

## 3) Sampling policy comparison (RWFB vs baseline)
We compare hub overlap (top-20 nodes by degree) between the full graph and each sub-sample.

In [None]:
def sample_baseline(nodes, frac=0.3, seed=0):
    """Uniformly sample a fraction of nodes without replacement.

    Args:
        nodes: sequence of node labels.
        frac: fraction of nodes to keep; the sample size is
            ``int(frac * len(nodes))`` clamped to ``[1, len(nodes)]``.
        seed: seed for the local NumPy generator (global RNG state is untouched).

    Returns:
        Set of sampled labels, returned as the original objects. Empty when
        ``nodes`` is empty.
    """
    nodes = list(nodes)
    if not nodes:
        # rng.choice cannot draw from an empty population.
        return set()
    rng = np.random.default_rng(seed)
    # Clamp to the population size: replace=False cannot draw more than exists.
    k = min(len(nodes), max(1, int(frac * len(nodes))))
    # Sample positions rather than labels: passing labels through a NumPy
    # array returns NumPy scalars and coerces mixed-type labels to strings.
    picked = rng.choice(len(nodes), size=k, replace=False)
    return {nodes[int(i)] for i in picked}

def induced_subgraph(edges: pd.DataFrame, keep_nodes: set):
    """Return a copy of the edges whose both endpoints are in ``keep_nodes``."""
    src_kept = edges['src'].isin(keep_nodes)
    dst_kept = edges['dst'].isin(keep_nodes)
    both_kept = src_kept & dst_kept
    # .copy() detaches the result so callers can modify it freely.
    return edges.loc[both_kept].copy()

def hub_overlap(full_deg: pd.Series, sub_deg: pd.Series, k=20):
    """Fraction of the full graph's top-k nodes that are also top-k in the sample.

    Both arguments are degree Series indexed by node. The denominator is the
    number of top-k nodes found in ``full_deg`` (at least 1), so an empty
    ``full_deg`` yields 0.0.
    """
    def top_k_labels(degrees):
        ranked = degrees.sort_values(ascending=False)
        return set(ranked.head(k).index)

    reference = top_k_labels(full_deg)
    candidate = top_k_labels(sub_deg)
    shared = reference.intersection(candidate)
    denominator = len(reference) if len(reference) >= 1 else 1
    return len(shared) / denominator

# Reference ranking: total degree of every node in the full graph.
full_deg_series = degree_stats(edges_df)['deg']


def _hub_overlap_for_sample(keep_nodes):
    """Return (induced edges, their degree Series, hub overlap vs. full graph)."""
    sub_edges = induced_subgraph(edges_df, keep_nodes)
    if len(sub_edges) > 0:
        sub_deg = degree_stats(sub_edges)['deg']
    else:
        sub_deg = pd.Series(dtype=int)
    # An edgeless sample has no hubs to compare, so it scores 0.0.
    overlap = 0.0 if sub_deg.empty else hub_overlap(full_deg_series, sub_deg)
    return sub_edges, sub_deg, overlap


# Baseline: uniform random node sample.
keep_nodes_base = sample_baseline(list(full_deg_series.index), frac=0.3, seed=1)
edges_base, deg_base, overlap_base = _hub_overlap_for_sample(keep_nodes_base)

# RWFB: keep every node the walk visited at least once.
visits = rwfb_sample(adj, steps=5000, fly_back_p=0.3)
keep_nodes_rw = set(visits.keys())
edges_rw, deg_rw, overlap_rw = _hub_overlap_for_sample(keep_nodes_rw)

print('Hub overlap@20 (Baseline):', round(overlap_base,3))
print('Hub overlap@20 (RWFB p=0.3):', round(overlap_rw,3))

## 4) Attack simulations (remove top-k nodes/entities)
We remove the top-k hubs or entities and recompute giant component size as a robustness proxy.

In [None]:
def remove_nodes(edges: pd.DataFrame, bad_nodes: set):
    """Return a copy of the edges that touch none of ``bad_nodes``."""
    touches_bad = edges['src'].isin(bad_nodes) | edges['dst'].isin(bad_nodes)
    survivors = edges.loc[~touches_bad]
    return survivors.copy()

def giant_component_size(edges: pd.DataFrame):
    """Size of the largest undirected connected component (0 if there are none)."""
    component_sizes = components_undirected(edges)[0]
    if not component_sizes:
        return 0
    # components_undirected returns sizes in descending order.
    return component_sizes[0]

# Node-level attack: drop the k highest-degree nodes and compare the giant
# component before and after.
k = 20
top_nodes = list(deg_df.index[:k])
edges_k = remove_nodes(edges_df, set(top_nodes))
g0 = giant_component_size(edges_df)
gk = giant_component_size(edges_k)
print(f'Giant component | original: {g0} → after removing top-{k} nodes: {gk} (Δ={g0-gk})')

# Entity-level removal demo using demo_map
deg_e_table,_,_,ent_e = entity_metrics(edges_df, demo_map, topk=5)
top_entities = set(deg_e_table.head(5).index)
map_dict = dict(zip(demo_map['address'], demo_map['entity']))
# Every address mapped to one of the top entities is removed ...
bad_nodes_e = {addr for addr,ent in map_dict.items() if ent in top_entities}
# ... and so is every top "entity" that is really an unmapped address:
# entity_metrics labels unmapped addresses with the address itself, so such
# labels can be removed directly. Skipping them would leave the highest-ranked
# entities in the graph whenever they are not covered by the mapping.
bad_nodes_e |= top_entities - set(map_dict.values())
edges_e_removed = remove_nodes(edges_df, bad_nodes_e)
g_e = giant_component_size(edges_e_removed)
print(f'Giant component | after removing top-5 entities (by mapped addresses): {g_e} (original {g0})')

## 5) Auto-generated Results blurb (paste into your paper)
Edit the wording as needed; this is a minimal narrative tied to the computed numbers.

In [None]:
# Assemble the Results paragraph from values computed in earlier cells.
# Comparative wording is chosen from the measured numbers so the text cannot
# claim an increase or reduction that did not happen in this run.
tmpl = []
tmpl.append(f"We analyze a directed transaction graph comprising {len(set(edges_df['src']).union(set(edges_df['dst'])))} addresses and {len(edges_df)} edges.")
tmpl.append(f"The degree distribution exhibits a heavy-tailed shape; the top-10 nodes hold a median degree of {int(deg_df['deg'].head(10).median())}.")
tmpl.append(f"The largest connected component contains {g0} nodes, indicating substantial connectivity.")

if overlap_rw > overlap_base:
    rw_clause = f"increases to {round(overlap_rw,3)}, suggesting improved stability of hub identification under walk-based sampling"
elif overlap_rw < overlap_base:
    rw_clause = f"decreases to {round(overlap_rw,3)}, so walk-based sampling does not improve hub identification in this run"
else:
    rw_clause = f"is unchanged at {round(overlap_rw,3)}"
# The baseline keeps 30% of the nodes (frac=0.3); it does not remove 30%.
tmpl.append(f"Under uniform random sub-sampling that retains 30% of nodes (baseline), hub overlap@20 is {round(overlap_base,3)}; with RWFB (p=0.3), overlap@20 {rw_clause}.")

if g_e < g0:
    gc_clause = f"reduces the giant component from {g0} to {g_e} nodes"
else:
    gc_clause = f"leaves the giant component unchanged at {g_e} nodes"
tmpl.append(f"Mapping a subset of high-degree addresses to entities yields a top-5 entity out-flow share of {round(topk_out_share,3)} and in-flow share of {round(topk_in_share,3)}; removing the top-5 entities by degree {gc_clause}, quantifying concentration and systemic impact.")
results_blurb = " ".join(tmpl)
print(results_blurb)

## 6) Next steps / TODO hooks
- Swap synthetic demo for your real `edges_YEAR.csv` and `entity_map.csv`.
- Add richer centrality (betweenness/bridging) if you provide a smaller graph slice (since exact APSP is expensive without graph libs).
- (Optional) Add **LLM orchestration**: have an agent write/run new sampling sweeps, sensitivity checks, and produce figure captions automatically.
