In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported local
# modules (such as `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("../")
import pathlib

import pandas as pd

from utils import print_stats

# Processed submissions table; every graph in this notebook is derived from it.
df = pd.read_parquet("../processed/submissions_final.parquet")

# Path.mkdir() returns None, so bind the Path first and create the directory
# in a separate statement; otherwise `out_dir` would end up being None.
out_dir = pathlib.Path("../graphs")
out_dir.mkdir(exist_ok=True, parents=True)

In [3]:
# Preview the loaded table (pandas truncates the rendering to the first and last rows/columns).
df

Unnamed: 0,image_id,unixtime,rawtime,title,total_votes,reddit_id,number_of_upvotes,subreddit,number_of_downvotes,localtime,...,is_deleted_user,score_per_1k_subs,nsfw_flag,title_sentiment,author_activity,content_type,prev_title,prev_score,title_edit_dist,score_gain
0,0,1.333172e+09,2012-03-31T12:40:39.590113-07:00,And here's a downvote.,63470.0,rmqjs,32657.0,funny,30813,1.333198e+09,...,False,,False,0.0,1,image,Unknown,0,0.000000,0
1,0,1.333178e+09,2012-03-31T14:16:01.093638-07:00,Expectation,35.0,rmun4,29.0,GifSound,6,1.333203e+09,...,False,,False,0.0,5213,gif,And here's a downvote.,1844,24.242424,-1821
2,0,1.333200e+09,2012-03-31T20:18:33.192906-07:00,Downvote,41.0,rna86,32.0,GifSound,9,1.333225e+09,...,False,,False,0.0,5213,gif,Expectation,23,21.052632,0
3,0,1.333252e+09,2012-04-01T10:52:10-07:00,Every time I downvote something,10.0,ro7e4,6.0,GifSound,4,1.333278e+09,...,False,0.131622,False,0.0,5213,gif,Downvote,23,35.897436,-21
4,0,1.333273e+09,2012-04-01T16:35:54.393381-07:00,Downvote &quot;Dies Irae&quot;,65.0,rooof,57.0,GifSound,8,1.333298e+09,...,False,,False,0.0,5213,gif,Every time I downvote something,2,36.065574,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67562,25886,1.352741e+09,2012-11-12T17:22:46+00:00,WTF TGIF,5.0,132jrv,1.0,WTF,4,1.352741e+09,...,True,-0.001333,False,-0.5,17795,image,When the product doesn't even come close to th...,248,5.405405,-251
67563,25886,1.352763e+09,2012-11-12T23:26:52+00:00,&quot;Loaded&quot; potato skins (x-post /r/mil...,391.0,1339et,341.0,ExpectationVsReality,50,1.352763e+09,...,False,18.677792,False,0.0,8,image,WTF TGIF,-3,2.898551,294
67564,25887,1.346878e+09,2012-09-06T03:53:50-07:00,Finger art [pics],900.0,zg013,763.0,woahdude,137,1.346904e+09,...,False,4.732672,False,0.0,164,image,Unknown,0,0.000000,0
67565,25887,1.346917e+09,2012-09-06T14:30:26-07:00,Found this neat style of art.,41.0,zgy3h,32.0,pics,9,1.346942e+09,...,False,0.009515,False,0.0,2,image,Finger art [pics],626,34.782609,-603


### Network artefacts & why they matter

| Graph                    | Node / Edge semantics                                                                           | Role in our analysis                                                                                                                                                        | Key stats & insight                                                                                                                     |
| ------------------------ | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
| **Repost-flow**          | **Directed** A → B when the same `image_id` moves from subreddit A to B; weight = # of hops     | Our **diffusion backbone**—lets us identify **revival hubs** (high in-degree), distinguish **exporters** vs **importers**, and trace multi-hop paths (e.g. aww→funny→pics). | 817 nodes · 3 335 edges · avg degree ≈ 8.2 · clustering ≈ 0.58 ⇒ strong flow triangles (A→B→C often accompanied by A→C)                 |
| **Co-repost projection** | **Undirected** A–B if any image appears in both subs; weight = # shared images                  | Maps **community affinity** without time: perfect for **Louvain** or spectral clustering to discover thematic blocks (“animal subs,” “GIF subs,” etc.).                     | 817 nodes · 4 655 edges · clustering ≈ 0.82 · avg shortest path ≈ 2 hops ⇒ a small-world of shared content                              |
| **Repost-amplification** | **Directed**, same edges as repost-flow but with `count`, `mean_gain`, `median_gain` attributes | Separates **volume** from **reward**—uncovers low-volume, high-reward edges for recommending “where to repost next” based on expected score boost.                          | Edge count matches repost-flow but adds score-gain semantics, so we can rank hops by their typical uplift rather than just by frequency |
| **Latency-flow**         | **Directed** A → B with `n_hops`, `median_gap_h` (hours), `speed = 1/median_gap_h`              | Captures **speed of diffusion**—spots “express lanes,” contrasts fast vs slow hubs, and supplies edge-delay distributions for our temporal “half-life” models.              | 817 nodes · 3 264 edges · avg degree ≈ 8.0 — slightly fewer edges than repost-flow because self-loops and hops with a non-positive time gap are dropped |


### Why these four graphs?

- **Repost-flow** gives us the “who passes to whom” map for every image, so we can detect revival hubs and path motifs.
- **Co-repost projection** reveals clusters of subs that **share** content, guiding community-block layouts and topic-based filters.
- **Repost-amplification** adds a **reward** dimension—identifying the edges that reliably boost score, not just move volume.
- **Latency-flow** adds a **speed** dimension—showing how quickly each edge fires, critical for modeling attention decay and half-life in our temporal chapter.


In [4]:
import networkx as nx
import pandas as pd

# Repost-flow graph: directed edge A -> B for every pair of consecutive
# submissions of the same image, weighted by how often that hop occurs.

# Sort chronologically within each image so adjacent rows are consecutive posts.
df_flow = df.sort_values(["image_id", "unixtime"])

# One row per hop. `shift(-1)` inside each image group pairs a submission's
# subreddit with the subreddit of the next submission of the same image; the
# last submission of every image gets a missing `dst` and is dropped explicitly.
# (The previous tuple-based version relied on groupby silently discarding NaN
# keys, because `dropna()` never treats a `(sub, NaN)` tuple as missing.)
# Casting to object keeps the grouping below free of categorical cross-products.
edge_df = pd.DataFrame(
    {
        "src": df_flow["subreddit"].astype(object),
        "dst": df_flow.groupby("image_id")["subreddit"].shift(-1).astype(object),
    }
).dropna(subset=["src", "dst"])

# Collapse identical hops into a single weighted edge.
edge_tbl = edge_df.groupby(["src", "dst"]).size().reset_index(name="weight")

G_flow = nx.from_pandas_edgelist(
    edge_tbl, "src", "dst", edge_attr="weight", create_using=nx.DiGraph()
)

nx.write_graphml(G_flow, "../graphs/repost_flow.graphml")

In [5]:
# Co-repost projection: undirected edge A - B when at least one image was
# posted in both subreddits, weighted by the number of shared images.
from collections import Counter
from itertools import combinations

# Distinct subreddits per image. Rows without a subreddit are skipped so the
# sort below never has to compare a string with NaN.
image_subs = (
    df.dropna(subset=["subreddit"]).groupby("image_id")["subreddit"].apply(set)
)

# Sets have no stable iteration order, so the same undirected pair could be
# emitted as both (A, B) and (B, A), be counted as two different edges, and
# have one count overwrite the other when loaded into the undirected graph.
# Sorting each set first gives every pair a single canonical orientation.
pair_counts = Counter()
for subs in image_subs:
    pair_counts.update(combinations(sorted(subs), 2))

# most_common() keeps the heaviest edges first, like value_counts() did.
edge_tbl = pd.DataFrame(
    [(src, dst, weight) for (src, dst), weight in pair_counts.most_common()],
    columns=["src", "dst", "weight"],
)

G_corepost = nx.from_pandas_edgelist(edge_tbl, "src", "dst", edge_attr="weight")


nx.write_graphml(G_corepost, "../graphs/corepost_projection.graphml")


In [6]:
# Repost-amplification graph: same hops as the repost-flow graph, but each
# edge carries the hop count plus the mean / median `score_gain` of the
# destination submission.

# Sort chronologically within each image so adjacent rows are consecutive posts.
df_flow = df.sort_values(["image_id", "unixtime"])

# One row per hop, built without a per-image Python callback: `shift(1)` inside
# each image group yields the subreddit of the previous submission, which is
# missing for the first submission of every image (those rows are dropped).
hops = (
    pd.DataFrame(
        {
            "src": df_flow.groupby("image_id")["subreddit"].shift(1),
            "dst": df_flow["subreddit"],
            "gain": df_flow["score_gain"],
        }
    )
    .dropna(subset=["src", "dst"])
    .reset_index(drop=True)
)

# observed=True restricts the result to (src, dst) pairs that actually occur.
# With a categorical `subreddit` column the default would materialise every
# category combination, which is what the former `count > 0` filter removed.
edge_gain = (
    hops.groupby(["src", "dst"], observed=True)
    .agg(
        count=("gain", "size"),
        mean_gain=("gain", "mean"),
        median_gain=("gain", "median"),
    )
    .reset_index()
)

G_gain = nx.from_pandas_edgelist(
    edge_gain,
    "src",
    "dst",
    edge_attr=["count", "mean_gain", "median_gain"],
    create_using=nx.DiGraph(),
)

nx.write_graphml(G_gain, "../graphs/repost_amplification.graphml")


  .apply(
  hops.groupby(["src", "dst"])


In [7]:
# Chronological order within each image, so neighbouring rows are consecutive posts.
df_time = df.sort_values(by=["image_id", "unixtime"])


def consecutive_rows(s):
    """Pair every row of one image's submissions with the row that follows it.

    Parameters
    ----------
    s : pandas.DataFrame
        Rows with at least the columns ``subreddit`` and ``unixtime``. The
        caller in this notebook passes one image's rows, already sorted by time.

    Returns
    -------
    pandas.DataFrame
        Columns ``src`` (subreddit of the row), ``dst`` (subreddit of the next
        row) and ``gap_hours`` (time between the two, in hours). Only pairs
        with a strictly positive gap are kept, which also discards the final
        row because it has no successor. The index is that of the ``src`` rows.
    """
    following = s.shift(-1)
    elapsed = following["unixtime"] - s["unixtime"]
    # NaN (no successor) compares False, so the last row is excluded as well.
    forward = elapsed > 0

    hop_frame = pd.DataFrame(
        {
            "src": s.loc[forward, "subreddit"],
            "dst": following.loc[forward, "subreddit"],
            "gap_hours": elapsed[forward] / 3600,
        }
    )
    return hop_frame


# Latency-flow graph: directed edge A -> B between different subreddits with
# the number of hops, the median delay in hours and its reciprocal as "speed".

# Selecting the two columns `consecutive_rows` reads keeps the grouping column
# out of the applied frames (avoiding pandas' deprecation warning) and stops
# every other column from being shifted needlessly.
hops = (
    df_time.groupby("image_id")[["subreddit", "unixtime"]]
    .apply(consecutive_rows)
    .reset_index(drop=True)
    .dropna(subset=["src", "dst"])
    # Self-loops are removed here once; the edge table no longer repeats the filter.
    .loc[lambda d: d["src"] != d["dst"]]
)

# observed=True restricts the result to (src, dst) pairs that actually occur.
# With a categorical `subreddit` column the default would materialise every
# category combination, which is what the former `n_hops > 0` filter removed.
# `median_gap_h` is strictly positive because `consecutive_rows` keeps only
# positive gaps, so the reciprocal below never divides by zero.
edge_tbl = (
    hops.groupby(["src", "dst"], observed=True)["gap_hours"]
    .agg(n_hops="size", median_gap_h="median")
    .reset_index()
    .assign(speed=lambda d: 1 / d["median_gap_h"])
)

G_latency = nx.from_pandas_edgelist(
    edge_tbl,
    "src",
    "dst",
    edge_attr=["n_hops", "median_gap_h", "speed"],
    create_using=nx.DiGraph(),
)


nx.write_graphml(G_latency, "../graphs/latency_flow.graphml")


  .apply(consecutive_rows)
  .groupby(["src", "dst"])["gap_hours"]


In [8]:
# Node/edge counts, average degree and clustering of the directed repost-flow graph.
print("▶ Repost-flow graph")
print_stats(G_flow, directed=True)

▶ Repost-flow graph
G.number_of_nodes() = 817
G.number_of_edges() = 3335
average_degree(G) = 8.16
average_clustering(G) = 0.5847


In [9]:
# Summary statistics for the undirected co-repost projection (no `directed` flag passed).
print("\n▶ Co-repost projection")
print_stats(G_corepost)


▶ Co-repost projection
G.number_of_nodes() = 817
G.number_of_edges() = 4655
average_degree(G) = 11.40
average_clustering(G) = 0.8165
connectivity_perc(G) = 1.00
average_shortest_path_length_sampled(G, n_samples) = 2.0


In [10]:
# Summary statistics for the directed amplification graph.
print("▶ Amplification graph (score-gain weighted)")
print_stats(G_gain, directed=True)

▶ Amplification graph (score-gain weighted)
G.number_of_nodes() = 817
G.number_of_edges() = 3335
average_degree(G) = 8.16
average_clustering(G) = 0.5847


In [11]:
# Summary statistics for the directed latency-flow graph.
print("▶ Latency-flow graph")
print_stats(G_latency, directed=True)

▶ Latency-flow graph
G.number_of_nodes() = 817
G.number_of_edges() = 3264
average_degree(G) = 7.99
average_clustering(G) = 0.5847
