In [306]:
from __future__ import annotations

import math
import pathlib
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from upsetplot import UpSet, from_memberships

try:
    import community as community_louvain
except ImportError:
    community_louvain = None

try:
    from lifelines import CoxPHFitter, KaplanMeierFitter
except ImportError:
    KaplanMeierFitter = CoxPHFitter = None

try:
    from rapidfuzz.distance import Levenshtein
except ImportError:
    Levenshtein = None


try:
    import community as community_louvain
except ImportError:
    community_louvain = None


In [307]:
# Project root: the parent of the (resolved) directory the notebook runs from.
ROOT = pathlib.Path(".").resolve().parent
GRAPHS_DIR = ROOT / "graphs"  # GraphML inputs
PROC_DIR = ROOT / "processed"  # parquet inputs
FIG_DIR = ROOT / "figures" / "networks"  # static figure outputs
HTML_DIR = FIG_DIR / "html"  # interactive HTML outputs

# FIG_DIR sits two levels below ROOT, so the intermediate "figures" directory
# must be created as well; without parents=True a fresh checkout raises
# FileNotFoundError here.
FIG_DIR.mkdir(parents=True, exist_ok=True)
HTML_DIR.mkdir(parents=True, exist_ok=True)

PLOTLY_TEMPL = "plotly_white"

In [308]:
def load_graph(name: str) -> nx.Graph:
    """Read ``GRAPHS_DIR/<name>.graphml`` and return the parsed NetworkX graph.

    Parameters
    ----------
    name : str
        File stem of the GraphML file, without the ``.graphml`` extension.
    """
    graphml_name = f"{name}.graphml"
    return nx.read_graphml(GRAPHS_DIR.joinpath(graphml_name))


def add_cluster_attribute(
    G: nx.Graph,
    resolution: float = 1.0,
    attr: str = "cluster",
    random_state: int | None = 42,
):
    """Add Louvain community IDs as a node attribute and return the partition.

    Parameters
    ----------
    G : nx.Graph
        Graph to annotate in place. Community detection runs on ``nx.Graph(G)``,
        i.e. a plain-``Graph`` copy of ``G``; ``G`` itself only receives the
        node attribute.
    resolution : float
        Passed through to ``community_louvain.best_partition``.
    attr : str
        Name of the node attribute that receives the community ID.
    random_state : int | None
        Seed forwarded to ``best_partition`` so that repeated runs give the same
        community IDs. Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    dict
        Mapping ``node -> community id`` as returned by ``best_partition``.

    Raises
    ------
    ImportError
        If python-louvain could not be imported.
    """
    if community_louvain is None:
        raise ImportError("python-louvain is not installed in this environment.")
    partition = community_louvain.best_partition(
        nx.Graph(G), resolution=resolution, random_state=random_state
    )
    nx.set_node_attributes(G, partition, attr)
    return partition

In [309]:
def plotly_flow_map(
    html_out: pathlib.Path = HTML_DIR / "repost_flow.html",
    *,
    layout_k=0.05,
    node_size_prop=5,
    edge_opacity=0.2,
    node_opacity=0.8,
):
    """Render the ``repost_flow`` graph as an interactive Plotly node-link map.

    Parameters
    ----------
    html_out : pathlib.Path
        Destination HTML file.
    layout_k : float
        ``k`` argument of ``nx.spring_layout``.
    node_size_prop : float
        Multiplier applied to ``sqrt(weighted degree)`` to get the marker size.
    edge_opacity : float
        Alpha channel of the grey edge colour.
    node_opacity : float
        Marker opacity for the nodes.
    """
    # 1) Load the graph and compute a seeded (reproducible) spring layout.
    graph = load_graph("repost_flow")
    layout = nx.spring_layout(graph, k=layout_k, seed=42)

    # 2) Edge trace: every edge contributes (start, end, None) to each
    #    coordinate list; the None breaks the line between segments.
    seg_x, seg_y = [], []
    for src, dst in graph.edges():
        (sx, sy), (dx, dy) = layout[src], layout[dst]
        seg_x.extend((sx, dx, None))
        seg_y.extend((sy, dy, None))
    edge_line = dict(color="rgba(100,100,100,{})".format(edge_opacity), width=1)
    edge_trace = go.Scatter(
        x=seg_x, y=seg_y, mode="lines", line=edge_line, hoverinfo="none"
    )

    # 3) Node trace: colour and size both derive from the weighted degree.
    weighted_degree = dict(graph.degree(weight="weight"))
    names = list(weighted_degree)
    degrees = list(weighted_degree.values())
    marker_style = dict(
        showscale=True,
        colorscale="YlGnBu",
        color=degrees,
        size=[(d**0.5) * node_size_prop for d in degrees],
        opacity=node_opacity,
        line=dict(width=0.5, color="black"),
        colorbar=dict(title="Weighted degree"),
    )
    node_trace = go.Scatter(
        x=[layout[n][0] for n in names],
        y=[layout[n][1] for n in names],
        mode="markers",
        hoverinfo="text",
        text=[f"{n}<br>deg: {d}" for n, d in weighted_degree.items()],
        marker=marker_style,
    )

    # 4) Assemble the figure and export it.
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig = go.Figure([edge_trace, node_trace])
    fig.update_layout(
        title="Repost-Flow (Plotly)",
        plot_bgcolor="white",
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        margin=dict(l=20, r=20, t=40, b=20),
    )
    fig.write_html(str(html_out), include_plotlyjs="cdn")
    print(f"✔ Wrote improved Plotly map → {html_out}")

In [310]:
def inout_scatter(plot_out: pathlib.Path = HTML_DIR / "inout_scatter.html"):
    """Scatter weighted in-degree against out-degree for the ``repost_flow`` graph.

    Each node is coloured by its Louvain community and sized by its total
    weighted degree. Both axes are logarithmic.

    Parameters
    ----------
    plot_out : pathlib.Path
        Destination HTML file.
    """
    G = load_graph("repost_flow")
    add_cluster_attribute(G)

    # Compute every degree view once instead of once per node.
    nodes = list(G.nodes())
    in_deg = dict(G.in_degree(weight="weight"))
    out_deg = dict(G.out_degree(weight="weight"))
    strength = dict(G.degree(weight="weight"))

    df = pd.DataFrame(
        {
            "subreddit": nodes,
            "in_degree": [in_deg[n] for n in nodes],
            "out_degree": [out_deg[n] for n in nodes],
            "strength": [strength[n] for n in nodes],
            # Cluster IDs are categorical labels. As strings they get a
            # discrete legend; as integers Plotly Express would draw a
            # continuous colour bar and the legend title below would apply to
            # nothing.
            "cluster": [str(G.nodes[n]["cluster"]) for n in nodes],
        }
    )
    cluster_order = sorted(df["cluster"].unique(), key=int)

    fig = px.scatter(
        df,
        x="out_degree",
        y="in_degree",
        color="cluster",
        size="strength",
        hover_name="subreddit",
        category_orders={"cluster": cluster_order},
        template=PLOTLY_TEMPL,
        height=700,
        title="In‑ vs Out‑degree (log–log)",
    )
    # NOTE: a log axis cannot place zero, so nodes whose weighted in- or
    # out-degree is 0 do not appear in the plot.
    fig.update_xaxes(type="log")
    fig.update_yaxes(type="log")
    fig.update_layout(legend_title_text="Community cluster")
    fig.write_html(plot_out)
    print(f"✔ Scatter → {plot_out}")

## Zipf plot of edge weights

### 1) What the plot does

Ranks every subreddit-to-subreddit edge by the number of images that traversed it and plots rank versus weight on log–log axes.  
_Why we usually make this plot_ – Zipf curves reveal whether a few diffusion channels dominate and how sharply traffic drops off.

### 2) What we observe in it

• Near-linear negative slope → classic heavy-tailed behaviour.  
• Top five edges transmit orders of magnitude more images than the median edge.  
• A shoulder around ranks 5–15 hints at a “core highway” of medium-volume routes.  
• Tail flattens gently; no abrupt elbow or regime change.

### 3) Insights relative to the pitch

• Validates focusing on high-capacity “express lanes” when hunting revival hubs.  
• Long tail of low-traffic edges suggests niche subs occasionally give images a second life – good material for case studies.  
• Confirms we can prune the network for visual clarity without losing explanatory power.


In [311]:
def edge_weight_zipf(plot_out: pathlib.Path = FIG_DIR / "edge_weight_zipf.png"):
    """Save a log–log rank-vs-weight plot of the ``repost_flow`` edge weights.

    Parameters
    ----------
    plot_out : pathlib.Path
        Destination image file.
    """
    graph = load_graph("repost_flow")

    # Heaviest edge first, so that rank 1 is the largest weight.
    descending = [attrs["weight"] for *_, attrs in graph.edges(data=True)]
    descending.sort(reverse=True)
    rank_axis = np.arange(1, len(descending) + 1)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.loglog(rank_axis, descending, marker=".")
    ax.set_xlabel("Edge rank (log)")
    ax.set_ylabel("Edge weight (log)")
    ax.set_title("Zipf plot of edge weights")
    fig.tight_layout()
    fig.savefig(plot_out, dpi=150)
    plt.close(fig)
    print(f"✔ Zipf plot → {plot_out}")


In [312]:
def sankey_top_images(
    *, n_images: int = 10, html_out: pathlib.Path = HTML_DIR / "sankey_images.html"
):
    """Draw a Sankey diagram of the first three hops of the most-posted images.

    Parameters
    ----------
    n_images : int
        Number of images (ranked by row count per ``image_id``) to consider.
    html_out : pathlib.Path
        Destination HTML file.

    Images with fewer than three rows are skipped; if none remain, a warning is
    printed and nothing is written.
    """
    submissions = pd.read_parquet(PROC_DIR / "submissions_final.parquet")
    per_image = submissions.groupby("image_id").size()
    top_imgs = per_image.sort_values(ascending=False).head(n_images).index

    def first_three_hops(img) -> list[str]:
        # Subreddits of the three earliest rows (by "unixtime") for this image.
        # NOTE(review): consecutive posts in the same subreddit are kept as-is,
        # so a path may contain a self-link — confirm this is intended.
        rows = submissions[submissions.image_id == img].sort_values("unixtime")
        return rows["subreddit"].tolist()[:3]

    paths: list[list[str]] = [
        hops for hops in map(first_three_hops, top_imgs) if len(hops) == 3
    ]
    if len(paths) == 0:
        print("⚠ No images with ≥3 hops – skip Sankey.")
        return

    # Alphabetically sorted node labels and their integer positions.
    nodes = sorted({sub for seq in paths for sub in seq})
    nidx = dict(zip(nodes, range(len(nodes))))

    # One row per consecutive hop, then aggregated into a count per (src, dst).
    hop_pairs = [
        (nidx[a], nidx[b]) for seq in paths for a, b in zip(seq[:-1], seq[1:])
    ]
    link_df = pd.DataFrame(hop_pairs, columns=["src", "dst"])
    link_df["val"] = 1
    link_df = link_df.groupby(["src", "dst"], as_index=False)["val"].sum()

    # Links take the palette colour of their destination node's index, so a
    # link and the node it flows into share a colour.
    palette = px.colors.qualitative.Plotly
    n_colors = len(palette)
    link_df["color"] = [palette[d % n_colors] for d in link_df["dst"]]
    node_colors = [palette[i % n_colors] for i, _ in enumerate(nodes)]

    sankey = go.Sankey(
        node=dict(label=nodes, pad=12, thickness=12, color=node_colors),
        link=dict(
            source=link_df["src"],
            target=link_df["dst"],
            value=link_df["val"],
            color=link_df["color"],
        ),
    )
    fig = go.Figure(sankey)
    fig.update_layout(
        title="Repost paths for top‑circulated images",
        template=PLOTLY_TEMPL,
        height=700,
    )
    fig.write_html(html_out)
    print(f"✔ Sankey → {html_out}")


In [313]:
def chord_diagram(
    *, out: pathlib.Path = HTML_DIR / "chord_corepost.html", thr: int = 50
):
    """Plot ``corepost_projection`` edges whose weight is at least ``thr``.

    Parameters
    ----------
    out : pathlib.Path
        Destination HTML file.
    thr : int
        Minimum edge weight to keep.

    A ``go.Chord`` trace is used when ``plotly.graph_objects`` exposes one;
    otherwise the edges are drawn as a Sankey diagram.
    NOTE(review): plotly.graph_objects does not appear to ship a ``Chord``
    trace, so the Sankey branch is presumably the one that runs — confirm.
    """
    graph = load_graph("corepost_projection")

    strong = []
    for a, b, attrs in graph.edges(data=True):
        weight = attrs["weight"]
        if weight >= thr:
            strong.append((a, b, weight))
    if len(strong) == 0:
        print("threshold too high – no edges")
        return

    labels = sorted({node for a, b, _ in strong for node in (a, b)})
    size = len(labels)
    position = {label: i for i, label in enumerate(labels)}

    # Symmetric weight matrix; only the Chord branch consumes it.
    matrix = np.zeros((size, size))
    for a, b, weight in strong:
        row, col = position[a], position[b]
        matrix[row, col] = weight
        matrix[col, row] = weight

    chord_cls = getattr(go, "Chord", None)
    if chord_cls is not None:
        trace = chord_cls(labels=labels, matrix=matrix.tolist())
    else:
        trace = go.Sankey(
            arrangement="fixed",
            node=dict(label=labels, pad=15, thickness=10),
            link=dict(
                source=[position[a] for a, _, _ in strong],
                target=[position[b] for _, b, _ in strong],
                value=[weight for _, _, weight in strong],
            ),
        )
    fig = go.Figure(trace)

    fig.update_layout(
        title=f"Co‑repost diagram (≥{thr} shared images)",
        template=PLOTLY_TEMPL,
        height=700,
    )
    fig.write_html(out)
    print(f"✔ Chord → {out}")


## UpSet plot – multi-subreddit image paths

### 1) What the plot does

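UpSet visualisation of the most frequent subreddit combinations an image appears in (its repost "path"): top bar = number of images sharing a specific combination; dot-matrix = which subreddits form that combination. Memberships are built as unordered sets, so the plot itself encodes co-occurrence, not hop order.  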
_Why we usually make this plot_ – UpSet handles many-set intersections cleanly, exposing popular multi-hop trajectories without overplotting.

### 2) What we observe in it

• r/funny, r/pics and r/aww dominate the tallest bars – they are the final destination for most recycled images.  
• Most paths are two hops; longer chains drop sharply in frequency.  
• Common pattern: niche or topic sub → general-interest sub → mass-appeal hub.  
• Directionality is clear – paths rarely flow back to the niche origin.  
• Even rare, tail-end paths converge on the same revival hubs, underscoring their gravitational pull.

### 3) Insights relative to the pitch

• Empirically demonstrates the “promotion ladder” from niche to mainstream communities.  
• Reinforces that changing _where_ an image is posted can be as powerful as renaming it.  
• Provides concrete trajectories we can animate or narrate in the final story, linking the static and network chapters.


In [314]:
def upset_corepost(
    plot_out: pathlib.Path = FIG_DIR / "upset_corepost.png", top_k: int = 12
):
    """Save an UpSet plot of image overlap among the strongest co-repost subreddits.

    Parameters
    ----------
    plot_out : pathlib.Path
        Destination image file.
    top_k : int
        Number of subreddits to keep, ranked by weighted degree in the
        ``corepost_projection`` graph.

    Each image contributes the list of top subreddits it appears in. If no
    image appears in any of them, a warning is printed and nothing is written.
    """
    G = load_graph("corepost_projection")
    strength = dict(G.degree(weight="weight"))
    top_subs = sorted(strength, key=strength.get, reverse=True)[:top_k]

    # Only these two columns are needed, so only they are read.
    imgdf = pd.read_parquet(
        PROC_DIR / "submissions_final.parquet", columns=["image_id", "subreddit"]
    )
    memberships = (
        imgdf[imgdf.subreddit.isin(top_subs)]
        .groupby("image_id")["subreddit"]
        .apply(list)
        .tolist()
    )
    if not memberships:
        # Same skip-with-warning convention as the Sankey / chord functions.
        print("⚠ No images found in the top subreddits – skip UpSet.")
        return

    upset = from_memberships(memberships)

    # The figure is handed to UpSet.plot explicitly and closed in ``finally``,
    # so the figure that is drawn is the one that is saved, and no empty figure
    # stays open for the notebook to display if plotting fails.
    fig = plt.figure(figsize=(10, 5))
    try:
        UpSet(upset, subset_size="count", show_counts=True).plot(fig=fig)
        fig.suptitle("UpSet – intersections across top subs")
        fig.tight_layout()
        fig.savefig(plot_out, dpi=150)
    finally:
        plt.close(fig)
    print(f"✔ UpSet → {plot_out}")


## Block-model adjacency heat-maps (log1p weights)

### 1) What the plot does

Four zoomed adjacency matrices of the repost-flow graph, reordered by community detection (Louvain / SBM) and coloured by the log of edge weight.  
_Why we usually make this plot_ – to visually confirm modular structure and spot dense intra-cluster vs sparse inter-cluster connections.

### 2) What we observe in them

• Dark squares along the diagonal show strong reposting within thematic clusters.  
• Thin bright stripes bridging blocks highlight high-traffic cross-community links.  
• Blocks vary in density and size – some subs act as large generalist hubs, others as tight niche clusters.  
• Occasional bright pixels outside any block reveal individual “wormholes” (e.g., r/funny → a niche sub) that shortcut the hierarchy.

### 3) Insights relative to the pitch

• Supports the idea that revival mostly happens _inside_ communities before an image jumps outward.  
• Cross-block stripes identify prime export hubs for the amplification chapter.  
• No obvious artefacts – block structure appears data-driven, so later path analyses rest on solid ground.


In [315]:
def blockmodel_heatmap(plot_out: pathlib.Path = FIG_DIR / "block_heatmap.png"):
    """Save an adjacency heat-map of ``corepost_projection`` ordered by community.

    Nodes are sorted by (Louvain community, weighted degree) and each edge is
    written symmetrically as ``log1p(weight)``.

    Parameters
    ----------
    plot_out : pathlib.Path
        Destination image file.
    """
    graph = load_graph("corepost_projection")
    community = add_cluster_attribute(graph)

    def block_order(node):
        # Group by community first, then by ascending weighted degree.
        return community[node], graph.degree(node, weight="weight")

    ordered = sorted(graph.nodes(), key=block_order)
    row_of = {node: row for row, node in enumerate(ordered)}

    side = len(ordered)
    mat = np.zeros((side, side))
    for a, b, attrs in graph.edges(data=True):
        r, c = row_of[a], row_of[b]
        cell = math.log1p(attrs["weight"])
        mat[r, c] = cell
        mat[c, r] = cell

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(mat, cmap="mako_r", xticklabels=False, yticklabels=False, ax=ax)
    ax.set_title("Block‑model heat‑map (log1p weights)")
    fig.tight_layout()
    fig.savefig(plot_out, dpi=150)
    plt.close(fig)
    print(f"✔ Heat‑map → {plot_out}")


In [316]:
def sunburst_communities(
    html_out: pathlib.Path = HTML_DIR / "sunburst_communities.html",
):
    """Write a sunburst of ``corepost_projection`` grouped by Louvain community.

    The hierarchy is All → cluster → subreddit, with wedge sizes given by each
    subreddit's weighted degree.

    Parameters
    ----------
    html_out : pathlib.Path
        Destination HTML file.
    """
    graph = load_graph("corepost_projection")
    node_cluster = add_cluster_attribute(graph)

    subreddits = list(node_cluster)
    frame = pd.DataFrame(
        {
            "subreddit": subreddits,
            "cluster": [node_cluster[s] for s in subreddits],
            "strength": [graph.degree(s, weight="weight") for s in subreddits],
        }
    ).assign(all="All")  # constant column that serves as the single root ring

    fig = px.sunburst(
        frame,
        path=["all", "cluster", "subreddit"],
        values="strength",
        template=PLOTLY_TEMPL,
        title="Community sunburst",
        height=700,
    )
    fig.write_html(html_out)
    print(f"✔ Sunburst → {html_out}")

In [317]:
def freq_vs_median_gain_scatter(out: pathlib.Path = HTML_DIR / "freq_vs_gain.html"):
    """Scatter edge frequency against median gain for ``repost_amplification``.

    Every edge becomes one point: x = ``count`` (log axis), y = ``median_gain``,
    marker size = ``|mean_gain| + 1``, colour = whether ``median_gain`` is
    positive.

    Parameters
    ----------
    out : pathlib.Path
        Destination HTML file.

    If the graph has no edges, a warning is printed and nothing is written.
    """
    G = load_graph("repost_amplification")
    df = pd.DataFrame([{**d, "src": u, "dst": v} for u, v, d in G.edges(data=True)])
    if df.empty:
        # An edgeless graph gives a frame with no columns; skip rather than
        # fail with a KeyError on the first column lookup.
        print("⚠ repost_amplification has no edges – skip freq-vs-gain scatter.")
        return

    for col in ("count", "median_gain", "mean_gain"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Coercion can introduce NaN, which is not usable as a marker size, so a
    # missing mean_gain falls back to the minimum size of 1.
    df["marker_size"] = df["mean_gain"].abs().fillna(0) + 1
    df["positive_gain"] = df["median_gain"] > 0

    fig = px.scatter(
        df,
        x="count",
        y="median_gain",
        size="marker_size",
        color="positive_gain",
        hover_data=["src", "dst", "mean_gain"],
        # Named columns plus labels give the legend and hover readable titles.
        labels={
            "positive_gain": "median gain > 0",
            "marker_size": "|mean_gain| + 1",
        },
        template=PLOTLY_TEMPL,
        height=700,
        title="Edge frequency vs. median karma gain (marker size ∝ |mean_gain|)",
    )
    fig.update_xaxes(type="log")
    fig.write_html(out)
    print(f"✔ Freq‑vs‑gain scatter → {out}")

## Distribution of image resubmissions

### 1) What the plot does

A histogram of how often each image re-appears in the dataset, with counts shown on a logarithmic scale.  
_Why we usually make this plot_ – to check whether content popularity is narrowly concentrated or has a long-tailed “viral” spread.

### 2) What we observe in it

• A dominant spike at “single-use” images – most pictures are never reposted.  
• A smooth, heavy-tailed decline stretching past 150 resubmissions.  
• No clear secondary hump, indicating reposting frequency is continuous rather than bimodal.  
• Only a handful of extreme outliers (“evergreens”) account for the far right tail.

### 3) Insights relative to the pitch

• Confirms the _“one image, many faces”_ framing – a tiny minority fuels the recycling story.  
• Suggests revival hubs are choosy: they amplify just a sliver of all content.  
• Provides a baseline for later survival-curve work (temporal chapter).


In [318]:
def resubmission_hist(out: pathlib.Path = FIG_DIR / "resubmission_hist.png"):
    """Save a histogram of the number of rows per ``image_id`` (log-scaled counts).

    Parameters
    ----------
    out : pathlib.Path
        Destination image file.

    If the parquet file holds no rows, a warning is printed and nothing is
    written.
    """
    # NOTE(review): this counts every row per image, so an image posted once
    # has value 1 — presumably one row per submission; confirm that the
    # "# resubmissions" axis label is meant to include the first post.
    cnt = (
        pd.read_parquet(PROC_DIR / "submissions_final.parquet", columns=["image_id"])
        .groupby("image_id")
        .size()
    )
    if cnt.empty:
        print("⚠ No submissions found – skip resubmission histogram.")
        return

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        # log_scale=(False, True): linear x axis, logarithmic y axis.
        sns.histplot(cnt, bins=50, log_scale=(False, True), ax=ax)
        ax.set_xlabel("# resubmissions per image")
        ax.set_ylabel("Images (log)")
        ax.set_title("Distribution of image resubmissions")
        fig.tight_layout()
        fig.savefig(out, dpi=150)
    finally:
        # Close even on failure so no half-drawn figure stays open.
        plt.close(fig)
    # Status line, as printed by every other plotting function in this notebook.
    print(f"✔ Resubmission histogram → {out}")


In [None]:
# Plotting functions executed by run_all(), in this order. Each one is called
# with its default arguments (run_all passes n_images=10 to sankey_top_images).
ALL_FUNCS = [
    plotly_flow_map,
    inout_scatter,
    edge_weight_zipf,
    sankey_top_images,
    chord_diagram,
    upset_corepost,
    blockmodel_heatmap,
    sunburst_communities,
    freq_vs_median_gain_scatter,
    resubmission_hist,
]


def run_all():
    """Run every function in ``ALL_FUNCS`` in order, with warnings silenced.

    A failure in one function is reported on stdout and does not stop the
    remaining ones. The total elapsed time and the output directories are
    printed at the end.
    """
    # Keyword arguments for functions that are not called bare.
    call_kwargs = {"sankey_top_images": {"n_images": 10}}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        started_at = datetime.now()
        for plot_fn in ALL_FUNCS:
            label = plot_fn.__name__
            try:
                print(f"→ {label}()")
                plot_fn(**call_kwargs.get(label, {}))
            except Exception as err:
                # Best effort: report and carry on with the next figure.
                print(f"⚠ {label} failed: {err}")
        elapsed = datetime.now() - started_at
        print(f"Completed in {elapsed} – outputs in {FIG_DIR} & {HTML_DIR}")

In [320]:
# Regenerate every figure; each function prints its own status line.
run_all()

→ plotly_flow_map()
✔ Wrote improved Plotly map → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\html\repost_flow.html
→ inout_scatter()
✔ Scatter → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\html\inout_scatter.html
→ edge_weight_zipf()
✔ Zipf plot → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\edge_weight_zipf.png
→ sankey_top_images()
✔ Sankey → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\html\sankey_images.html
→ chord_diagram()
✔ Chord → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\html\chord_corepost.html
→ upset_corepost()
✔ UpSet → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\upset_corepost.png
→ blockmodel_heatmap()
✔ Heat‑map → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\block_heatmap.png
→ sunburst_communities()
✔ Sunburst → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\networks\html\sunburst_communitie

<Figure size 1000x500 with 0 Axes>