In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import annotations

import pathlib
import warnings
from datetime import datetime

import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px

try:
    import community as community_louvain
except ImportError:
    community_louvain = None

try:
    from lifelines import CoxPHFitter, KaplanMeierFitter
except ImportError:
    KaplanMeierFitter = CoxPHFitter = None

try:
    from rapidfuzz.distance import Levenshtein
except ImportError:
    Levenshtein = None

from __future__ import annotations

import plotly.graph_objects as go

try:
    import community as community_louvain
except ImportError:
    community_louvain = None

try:
    from lifelines import CoxPHFitter, KaplanMeierFitter
except ImportError:
    KaplanMeierFitter = CoxPHFitter = None

try:
    from rapidfuzz.distance import Levenshtein
except ImportError:
    Levenshtein = None


try:
    import community as community_louvain
except ImportError:
    community_louvain = None


import altair as alt

alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [3]:
# Project layout. ROOT is the parent of the current working directory, so the
# notebook is expected to be launched from a sub-folder of the project.
ROOT = pathlib.Path(".").resolve().parent
GRAPHS_DIR = ROOT / "graphs"  # GraphML inputs read by load_graph()
PROC_DIR = ROOT / "processed"  # parquet inputs
FIG_DIR = ROOT / "figures" / "temporal"  # output root for this notebook
HTML_DIR = FIG_DIR / "html"  # interactive HTML exports

# FIG_DIR is nested two levels below ROOT; parents=True lets a fresh checkout
# (where "figures/" does not exist yet) succeed instead of raising
# FileNotFoundError.
FIG_DIR.mkdir(parents=True, exist_ok=True)
HTML_DIR.mkdir(parents=True, exist_ok=True)

# Plotly template shared by the figures in this notebook.
PLOTLY_TEMPL = "plotly_white"

In [4]:
def load_graph(name: str) -> nx.Graph:
    """Read ``GRAPHS_DIR/<name>.graphml`` and return the parsed graph.

    Parameters
    ----------
    name:
        File stem of the GraphML file (without the ``.graphml`` suffix).
    """
    graphml_file = GRAPHS_DIR.joinpath(f"{name}.graphml")
    graph = nx.read_graphml(graphml_file)
    return graph


def add_cluster_attribute(
    G: nx.Graph,
    resolution: float = 1.0,
    attr: str = "cluster",
    random_state: int | None = None,
):
    """Run Louvain community detection on ``G`` and store the result on its nodes.

    The partition is computed on an undirected copy (``nx.Graph(G)``) and then
    written back onto ``G`` itself as the node attribute ``attr``.

    Parameters
    ----------
    G:
        Graph whose nodes receive the cluster label (mutated in place).
    resolution:
        Forwarded to ``community_louvain.best_partition``.
    attr:
        Name of the node attribute that receives the community id.
    random_state:
        Optional seed forwarded to ``best_partition`` so that the partition is
        reproducible across runs. ``None`` (default) keeps the previous,
        unseeded behaviour and passes no extra argument.

    Returns
    -------
    dict
        The node -> community mapping returned by ``best_partition``.

    Raises
    ------
    ImportError
        If python-louvain could not be imported.
    """
    if community_louvain is None:
        raise ImportError("python-louvain is not installed in this environment.")

    # Only forward the seed when one was requested, so the default call is
    # exactly what it was before the parameter existed.
    seed_kwargs = {} if random_state is None else {"random_state": random_state}
    partition = community_louvain.best_partition(
        nx.Graph(G), resolution=resolution, **seed_kwargs
    )
    nx.set_node_attributes(G, partition, attr)
    return partition


def load_hops() -> pd.DataFrame:
    """Build the table of cross-subreddit "hops" from the submissions parquet.

    Submissions are ordered by ``image_id`` then ``unixtime``; each row is paired
    with the following submission of the same image. A row is kept as a hop when
    a following submission exists and it is in a different subreddit.

    Added columns:
      * ``next_time`` / ``next_sub`` - unixtime and subreddit of the following
        submission of the same image,
      * ``gap_h``      - ``(next_time - unixtime) / 3600``,
      * ``first_year`` - calendar year of the image's first submission,
      * ``gap_idx``    - 1-based position of the hop within its image,
      * ``timestamp``  - ``next_time`` converted to a datetime.
    """
    subs = pd.read_parquet(PROC_DIR / "submissions_final.parquet")
    subs = subs.sort_values(["image_id", "unixtime"])

    # Derive every per-image series up front, then attach them.
    per_image = subs.groupby("image_id")
    following_time = per_image["unixtime"].shift(-1)
    following_sub = per_image["subreddit"].shift(-1)
    first_seen = per_image["unixtime"].transform("first")
    subs = subs.assign(next_time=following_time, next_sub=following_sub)

    # A hop needs a successor and a change of subreddit.
    has_successor = subs["next_time"].notna()
    changes_sub = subs["subreddit"] != subs["next_sub"]
    hops = subs.loc[has_successor & changes_sub].copy()

    hops["gap_h"] = (hops["next_time"] - hops["unixtime"]) / 3600.0
    # Index-aligned assignment: only the years of the retained rows are used.
    hops["first_year"] = pd.to_datetime(first_seen, unit="s").dt.year
    hops["gap_idx"] = hops.groupby("image_id").cumcount() + 1
    hops["timestamp"] = pd.to_datetime(hops["next_time"], unit="s")
    return hops


In [5]:
def speed_carpet(html_out: pathlib.Path = HTML_DIR / "speed_carpet.html"):
    """Export a source x destination heat-map of log10 edge speed.

    Reads the ``latency_flow`` graph, pivots the ``speed`` edge attribute into a
    ``src`` (rows) by ``dst`` (columns) matrix and writes a Plotly heat-map of
    its base-10 logarithm to ``html_out``.

    The earlier version also ran Louvain clustering and stored per-edge cluster
    ids, but neither was used by the plot; that step is dropped so the figure no
    longer requires python-louvain and no longer does stochastic dead work.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    """
    G = load_graph("latency_flow")
    edges = pd.DataFrame(
        [{"src": u, "dst": v, "speed": d["speed"]} for u, v, d in G.edges(data=True)]
    )
    speed_matrix = edges.pivot(index="src", columns="dst", values="speed")

    # Node pairs without an edge are filled with 1e-6 before the log so they
    # land at -6 rather than NaN.
    fig = px.imshow(
        np.log10(speed_matrix.fillna(1e-6)),
        aspect="auto",
        color_continuous_scale="Turbo",
        labels=dict(color="log10(speed)"),
        template=PLOTLY_TEMPL,
        height=900,
        title="Edge speed carpet (log10)",
    )
    # Tick labels are hidden on both axes.
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    fig.write_html(html_out)
    print(f"✔ Speed carpet → {html_out}")


In [6]:
def rolling_median_gap_series(out=HTML_DIR / "rolling_gap.html", window="30D"):
    """Export a line chart of the rolling median repost gap (hours, log y-axis).

    Hops are indexed by ``timestamp`` and sorted, then ``gap_h`` is smoothed with
    a rolling median over ``window``.

    Parameters
    ----------
    out:
        Destination HTML file.
    window:
        Rolling window passed to ``Series.rolling`` (default ``"30D"``).
    """
    hops = load_hops().set_index("timestamp").sort_index()
    med = hops.gap_h.rolling(window).median().dropna()

    fig = px.line(
        med,
        title=f"Rolling {window} median repost gap",
        template=PLOTLY_TEMPL,
        height=600,
    )
    fig.update_yaxes(type="log")
    fig.write_html(out)
    # Confirmation line, consistent with the other exporters in this notebook.
    print(f"✔ Rolling median gap → {out}")


In [7]:
def monthly_resubmission_volume(out=HTML_DIR / "monthly_resubs.html"):
    """Export a bar chart of the number of hops per calendar month.

    Each hop's ``timestamp`` is truncated to the start of its month and the hops
    are counted per month.

    Parameters
    ----------
    out:
        Destination HTML file.
    """
    hops = load_hops()
    hops["month"] = hops.timestamp.dt.to_period("M").dt.to_timestamp()
    vol = hops.groupby("month").size().reset_index(name="resubs")

    fig = px.bar(
        vol,
        x="month",
        y="resubs",
        title="Monthly resubmission volume",
        template=PLOTLY_TEMPL,
        height=600,
    )
    fig.write_html(out)
    # Confirmation line, consistent with the other exporters in this notebook.
    print(f"✔ Monthly resubmissions → {out}")


In [8]:
def exporter_bar_race(
    html_out: pathlib.Path = HTML_DIR / "bar_race.html", *, period: str = "M"
):
    """Export an animated horizontal bar chart of cumulative mean score gain.

    ``gain`` is the difference in ``score`` between consecutive submissions of
    the same image (ordered by ``unixtime``). Gains are averaged per
    (period, subreddit) and accumulated per subreddit across periods; one
    animation frame is produced per period.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    period:
        Pandas period alias used to bucket submission times (default ``"M"``).
    """
    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet")
    df = df.sort_values(["image_id", "unixtime"])
    df["gain"] = df.groupby("image_id")["score"].diff()
    # The first submission of each image has no predecessor, hence no gain.
    df = df.dropna(subset=["gain"])
    df["period"] = (
        pd.to_datetime(df.unixtime, unit="s").dt.to_period(period).dt.to_timestamp()
    )

    gains = df.groupby(["period", "subreddit"])["gain"].mean().reset_index()
    gains["cum_gain"] = gains.groupby("subreddit")["gain"].cumsum()

    # Score differences can be negative, so the cumulative value can be too.
    # Extend the fixed x-range below zero when needed instead of clipping those
    # bars; with non-negative data this is still [0, max * 1.05].
    x_low = min(0.0, gains.cum_gain.min()) * 1.05
    x_high = max(0.0, gains.cum_gain.max()) * 1.05

    fig = px.bar(
        gains,
        x="cum_gain",
        y="subreddit",
        animation_frame="period",
        orientation="h",
        range_x=[x_low, x_high],
        template=PLOTLY_TEMPL,
        height=700,
        title="Cumulative avg karma exported per subreddit",
    )
    fig.update_layout(
        yaxis={"categoryorder": "total ascending"},
        transition={"duration": 500},
    )
    fig.write_html(html_out)
    print(f"✔ Bar‑race → {html_out}")


In [9]:
def edge_rank_bump_chart(
    html_out: pathlib.Path = HTML_DIR / "edge_bump.html",
    *,
    top_k: int = 50,
    min_years: int = 2,
    palette: list[str] | None = None,
):
    """Export a bump chart of yearly subreddit ranks by median score gain.

    ``gain`` is the per-image difference in ``score`` between consecutive
    submissions. For every year, subreddits are ranked by their median gain
    (rank 1 = highest). A subreddit is drawn when it appears in the yearly
    top ``top_k`` in at least ``min_years`` years, and only its top-``top_k``
    years are plotted.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    top_k:
        Rank cut-off applied within each year.
    min_years:
        Minimum number of years a subreddit must be inside the top ``top_k``.
    palette:
        Colours cycled over the (sorted) subreddits. ``None`` or an empty list
        falls back to ``px.colors.qualitative.Plotly``.
    """
    # Load & prepare data
    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet").sort_values(
        ["image_id", "unixtime"]
    )
    df["gain"] = df.groupby("image_id")["score"].diff()
    df = df.dropna(subset=["gain"])
    df["year"] = pd.to_datetime(df.unixtime, unit="s").dt.year

    # Median gain by (year, subreddit), ranked within each year
    g = df.groupby(["year", "subreddit"])["gain"].median().reset_index()
    g["rank"] = g.groupby("year")["gain"].rank(method="min", ascending=False)

    # Keep subreddits that reach the top_k often enough, and only those years
    in_top = g["rank"] <= top_k
    eligible = g.loc[in_top, "subreddit"].value_counts()
    keep_subs = eligible[eligible >= min_years].index
    g = g[g["subreddit"].isin(keep_subs) & in_top]

    # Colour mapping; an empty palette would otherwise divide by zero below.
    if palette is None or len(palette) == 0:
        palette = px.colors.qualitative.Plotly
    ordered_subs = sorted(keep_subs)
    colour_map = {
        sub: palette[i % len(palette)] for i, sub in enumerate(ordered_subs)
    }

    # Build figure: one line per subreddit
    fig = go.Figure()
    for sub in ordered_subs:
        d = g[g["subreddit"] == sub].sort_values("year")
        fig.add_trace(
            go.Scatter(
                x=d["year"],
                y=d["rank"],
                mode="lines+markers",
                line=dict(width=2, color=colour_map[sub]),
                marker=dict(size=6),
                name=sub,
                hovertemplate=(
                    f"<b>{sub}</b>"
                    "<br>year %{x}: rank %{y}"
                    "<br>median gain %{customdata:+,.0f}"
                    "<extra></extra>"  # suppress the secondary hover box
                ),
                customdata=d["gain"].values,
            )
        )

    # Rank 1 at the top of the chart
    fig.update_yaxes(autorange="reversed", title="Rank (1 = highest median gain)")
    fig.update_xaxes(title="Year", dtick=1)
    fig.update_layout(
        title="Median‑gain rank evolution of leading subreddits",
        height=600,
        template=PLOTLY_TEMPL,  # shared notebook template instead of a literal
        legend_title="Subreddit",
        margin=dict(l=60, r=20, t=60, b=40),
    )

    # Export
    fig.write_html(str(html_out), include_plotlyjs="cdn")
    print(f"✔ Bump chart → {html_out}")


In [10]:
def attention_span_trend(
    html_out: pathlib.Path = HTML_DIR / "attention_span.html",
):
    """Export an Altair line chart of the median log10 hop gap per first-year cohort.

    For every hop, ``log10(gap_h)`` is computed; the median of that value is then
    taken per ``first_year`` and drawn as a line with point markers.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    """
    hops = load_hops()

    # One row per cohort year with the median of log10(gap in hours).
    yearly = (
        hops[["gap_h", "first_year"]]
        .assign(log_gap=lambda frame: np.log10(frame.gap_h))
        .groupby("first_year")["log_gap"]
        .median()
        .reset_index(name="median_lg")
    )

    y_encoding = alt.Y(
        "median_lg:Q",
        scale=alt.Scale(type="linear"),
        axis=alt.Axis(format="~s"),
    )
    hover_fields = [
        "first_year",
        alt.Tooltip("median_lg", title="Median (log10 h)"),
    ]

    chart = (
        alt.Chart(yearly)
        .mark_line(point=True, strokeWidth=3)
        .encode(x="first_year:O", y=y_encoding, tooltip=hover_fields)
        .properties(
            width=550, height=400, title="Attention span shrinks 2008 → 2013"
        )
    )
    chart.save(str(html_out))
    print(f"✔ Attention-span chart → {html_out}")


In [11]:
def gap_ccdf(
    html_out: pathlib.Path = HTML_DIR / "gap_ccdf.html",
    cutoff_h: float | None = None,
):
    """Export a log-log empirical CCDF of the per-edge median repost gap.

    The ``median_gap_h`` attribute of every edge in the ``latency_flow`` graph
    is sorted and plotted against the fraction of edges whose gap is at least
    that large. A dashed vertical line marks ``cutoff_h``.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    cutoff_h:
        Position of the vertical marker in hours. ``None`` (default) uses the
        0.9 quantile of the gaps.
    """
    G = load_graph("latency_flow")
    gaps = np.array([d["median_gap_h"] for _, _, d in G.edges(data=True)])
    x = np.sort(gaps)
    # P(X >= x_i) for the i-th smallest value (0-based) is (n - i) / n. This
    # matches the axis label and keeps every point strictly positive; the
    # previous 1 - (i + 1) / n form is P(X > x) and ends at 0, which cannot be
    # drawn on a log axis.
    y = 1.0 - np.arange(len(x)) / len(x)

    fig = go.Figure(
        go.Scatter(
            x=x,
            y=y,
            mode="lines+markers",
            marker=dict(size=3),
            name="CCDF",
        )
    )
    fig.update_layout(
        template=PLOTLY_TEMPL,
        xaxis_title="Median repost gap (h, log)",
        yaxis_title="P(X ≥ x)",
        xaxis_type="log",
        yaxis_type="log",
        height=500,
        title="Heavy-tail of repost latencies",
    )

    # Only call the marker a "90 % cut" when it really is the 0.9 quantile; a
    # caller-supplied cutoff gets a neutral label.
    if cutoff_h is None:
        cutoff_h = np.quantile(gaps, 0.9)
        cutoff_label = f"90 % cut ≈ {cutoff_h:.0f} h"
    else:
        cutoff_label = f"cut-off ≈ {cutoff_h:.0f} h"
    fig.add_vline(
        x=cutoff_h,
        line=dict(dash="dash"),
        annotation_text=cutoff_label,
    )

    fig.write_html(html_out)
    print(f"✔ CCDF → {html_out}")

In [12]:
def hour_heatmap_interactive(
    html_out: pathlib.Path = HTML_DIR / "hour_heatmap.html",
):
    """Export a source-hour x destination-hour heat-map of the median hop gap.

    Each hop is bucketed by the hour of its source submission (``unixtime``) and
    the hour of its destination submission (``timestamp``). Cell colour is
    ``log10(median gap_h + 1e-3)``; the hover text also shows the number of hops
    in the cell.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    """
    hops = load_hops()
    hops = hops.assign(
        src_hr=pd.to_datetime(hops.unixtime, unit="s").dt.hour,
        dst_hr=hops.timestamp.dt.hour,
    )

    # Both tables share the same layout: rows = source hour, cols = dest hour.
    grid = dict(index="src_hr", columns="dst_hr", values="gap_h")
    median_gap = hops.pivot_table(aggfunc="median", **grid)
    hop_count = hops.pivot_table(aggfunc="size", **grid).fillna(0).astype(int)

    # The small offset keeps a zero median finite under the logarithm.
    log_median = np.log10(median_gap + 1e-3)

    fig = px.imshow(
        log_median,
        x=list(median_gap.columns),
        y=list(median_gap.index),
        aspect="auto",
        color_continuous_scale="Viridis",
        labels=dict(color="log10(median h)"),
        title="When to repost: source-hour × destination-hour",
        template=PLOTLY_TEMPL,
        height=700,
    )
    fig.update_traces(
        customdata=hop_count.values,
        hovertemplate="src %{y} h → dst %{x} h<br>median=%{z:.2f} log10 h<br>N=%{customdata} hops<extra></extra>",
    )
    fig.write_html(html_out)
    print(f"✔ Hour heat-map → {html_out}")


In [13]:
def survival_interactive(
    html_out: pathlib.Path = HTML_DIR / "survival_interactive.html",
):
    """Export Kaplan-Meier curves of hop gaps with a dropdown to switch grouping.

    Two families of curves are fitted on ``gap_h`` (every hop counted as an
    observed event):

    * "cohort" - one curve per ``first_year``;
    * "speed"  - one curve per edge speed class ("slow" / "mid" / "fast"), where
      classes are the terciles of the ``speed`` attribute over the edges of the
      ``latency_flow`` graph. Hops whose (subreddit, next_sub) pair is not an
      edge of that graph are left out of this family.

    The cohort curves are shown initially. Prints a notice and returns without
    writing anything when lifelines is unavailable.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    """
    if KaplanMeierFitter is None:
        print("ℹ️ lifelines not installed – interactive survival skipped")
        return

    hops = load_hops()
    fitter = KaplanMeierFitter()

    curves = []  # go.Scatter traces, in display order
    curve_kinds = []  # parallel to `curves`: "cohort" or "speed"

    def collect_curves(grouped, kind, label_of):
        """Fit one KM curve per group and append it as a trace of family `kind`."""
        for key, members in grouped:
            fitter.fit(members.gap_h, event_observed=np.ones(len(members)))
            survival = fitter.survival_function_
            label = label_of(key)
            curves.append(
                go.Scatter(
                    x=survival.index,
                    y=survival["KM_estimate"],
                    mode="lines",
                    name=label,
                    # Only the cohort family is visible when the page opens.
                    visible=(kind == "cohort"),
                    hovertemplate=f"{label}<br>t=%{{x:.1f}} h<br>survival=%{{y:.2f}}",
                )
            )
            curve_kinds.append(kind)

    # Family 1: cohorts by year of first submission.
    collect_curves(hops.groupby("first_year"), "cohort", lambda yr: str(int(yr)))

    # Family 2: tercile speed class of the edge each hop travels along.
    G = load_graph("latency_flow")
    q1, q2 = np.quantile([d["speed"] for _, _, d in G.edges(data=True)], [1 / 3, 2 / 3])

    def speed_class(speed):
        if speed <= q1:
            return "slow"
        if speed >= q2:
            return "fast"
        return "mid"

    cls_lookup = {(u, v): speed_class(d["speed"]) for u, v, d in G.edges(data=True)}
    hops["edge"] = list(zip(hops.subreddit, hops.next_sub))
    hops["speed_class"] = hops.edge.map(cls_lookup)
    classified = hops.dropna(subset=["speed_class"])
    collect_curves(classified.groupby("speed_class"), "speed", lambda spd: spd)

    # One dropdown entry per family; each toggles the matching traces on.
    menus = [
        dict(
            label=f"{family.capitalize()} groups",
            method="update",
            args=[
                {"visible": [kind == family for kind in curve_kinds]},
                {"title": f"Kaplan–Meier — {family}"},
            ],
        )
        for family in ("cohort", "speed")
    ]

    fig = go.Figure(curves)
    fig.update_layout(
        template=PLOTLY_TEMPL,
        yaxis_title="Survival probability",
        xaxis_title="Hours (log)",
        xaxis_type="log",
        updatemenus=[
            dict(
                type="dropdown",
                x=1.02,
                y=0.9,
                buttons=menus,
            )
        ],
        title="Kaplan–Meier — cohort",
        height=500,
    )

    fig.write_html(html_out, include_plotlyjs="cdn")
    print(f"✔ Interactive KM → {html_out}")


In [14]:
def cox_coeff_bar_ci(
    html_out: pathlib.Path = HTML_DIR / "cox_coeffs_ci.html",
):
    """Fit a Cox proportional-hazards model on hop durations and plot its coefficients.

    Hops are consecutive submissions of the same image that change subreddit.
    The model uses three covariates:

    * ``speed``      - ``speed`` attribute of the matching ``latency_flow`` edge
      (hops without a matching edge are dropped),
    * ``title_dist`` - Levenshtein distance between the two submission titles,
    * ``init_score`` - score of the source submission.

    Coefficients are drawn as horizontal bars with asymmetric error bars taken
    from the fitter's confidence intervals. Prints a notice and returns without
    writing anything when lifelines or rapidfuzz is unavailable.

    Parameters
    ----------
    html_out:
        Destination HTML file.
    """
    if CoxPHFitter is None or Levenshtein is None:
        print("ℹ️ lifelines/rapidfuzz missing – Cox bar skipped")
        return

    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet").sort_values(
        ["image_id", "unixtime"]
    )
    by_image = df.groupby("image_id")
    df = df.assign(
        next_time=by_image.unixtime.shift(-1),
        next_title=by_image.title.shift(-1),
        next_sub=by_image.subreddit.shift(-1),
    )

    # Build the mask on the same frame it indexes and take an explicit copy, so
    # the column assignments below never write into a slice of `df`.
    is_hop = df.next_time.notna() & (df.subreddit != df.next_sub)
    hops = df.loc[is_hop].copy()
    hops["duration"] = (hops.next_time - hops.unixtime) / 3600.0
    hops["event"] = 1  # every hop is an observed event

    G = load_graph("latency_flow")
    speed_lu = {(u, v): d["speed"] for u, v, d in G.edges(data=True)}
    hops["speed"] = [
        speed_lu.get(edge, np.nan) for edge in zip(hops.subreddit, hops.next_sub)
    ]
    hops = hops.dropna(subset=["speed"]).copy()

    # Plain iteration over the two title columns gives the same distances as a
    # row-wise DataFrame.apply without building a Series per row.
    hops["title_dist"] = [
        Levenshtein.distance(title, next_title)
        for title, next_title in zip(hops.title, hops.next_title)
    ]
    hops["init_score"] = hops.score

    cph = CoxPHFitter()
    cph.fit(
        hops[["duration", "event", "speed", "title_dist", "init_score"]],
        duration_col="duration",
        event_col="event",
    )

    coef = cph.params_.rename("coef").to_frame()
    ci = cph.confidence_intervals_
    # First column is taken as the lower bound, second as the upper bound.
    coef["low"] = ci.iloc[:, 0]
    coef["high"] = ci.iloc[:, 1]
    coef = coef.sort_values("coef")

    fig = go.Figure(
        go.Bar(
            x=coef["coef"],
            y=coef.index,
            orientation="h",
            error_x=dict(
                type="data",
                symmetric=False,
                array=coef["high"] - coef["coef"],
                arrayminus=coef["coef"] - coef["low"],
            ),
            hovertemplate="%{y}<br>coef=%{x:.4f}<br>CI=[%{customdata[0]:.4f}, %{customdata[1]:.4f}]<extra></extra>",
            customdata=np.stack([coef["low"], coef["high"]], axis=-1),
        )
    )
    fig.update_layout(
        template=PLOTLY_TEMPL,
        xaxis_title="Coefficient (log-HR)",
        yaxis_title="Covariate",
        height=400,
        title="Cox model: which factors hasten resubmission?",
    )
    fig.write_html(html_out)
    print(f"✔ Cox bar → {html_out}")


In [15]:
# Figure exporters executed by run_all(), in this order. Each entry is called
# with its default arguments unless run_all() special-cases its name.
ALL_FUNCS = [
    speed_carpet,
    rolling_median_gap_series,
    monthly_resubmission_volume,
    exporter_bar_race,
    edge_rank_bump_chart,
    attention_span_trend,
    survival_interactive,
    gap_ccdf,
    hour_heatmap_interactive,
    cox_coeff_bar_ci,
]


def run_all(funcs=None):
    """Run every figure exporter, reporting progress and failures on stdout.

    Warnings are silenced for the duration of the run. A failing exporter does
    not stop the run: its exception type and message are printed and the next
    exporter is started. A summary of the failed steps is printed at the end.

    Parameters
    ----------
    funcs:
        Iterable of callables to execute. ``None`` (default) runs ``ALL_FUNCS``.
    """
    funcs = list(ALL_FUNCS if funcs is None else funcs)

    # Keyword overrides for exporters that should not run with their defaults,
    # keyed by function name.
    extra_kwargs = {"sankey_top_images": {"n_images": 10}}

    failed = []
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        start = datetime.now()
        for fn in funcs:
            fname = getattr(fn, "__name__", repr(fn))
            print(f"→ {fname}()")
            try:
                fn(**extra_kwargs.get(fname, {}))
            except Exception as e:
                # Best-effort batch: keep going, but say what kind of error it
                # was (a bare KeyError message is just the missing key).
                failed.append(fname)
                print(f"⚠ {fname} failed: {type(e).__name__}: {e}")
        print(
            f"Completed in {datetime.now() - start} – outputs in {FIG_DIR} & {HTML_DIR}"
        )
        if failed:
            print(f"⚠ {len(failed)} of {len(funcs)} step(s) failed: {', '.join(failed)}")


In [16]:
# Generate every figure; per-step progress and failures are printed below.
run_all()

→ speed_carpet()
✔ Speed carpet → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\speed_carpet.html
→ rolling_median_gap_series()
→ monthly_resubmission_volume()
→ exporter_bar_race()
✔ Bar‑race → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\bar_race.html
→ edge_rank_bump_chart()
✔ Bump chart → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\edge_bump.html
→ attention_span_trend()
✔ Attention-span chart → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\attention_span.html
→ survival_interactive()
✔ Interactive KM → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\survival_interactive.html
→ gap_ccdf()
✔ CCDF → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\gap_ccdf.html
→ hour_heatmap_interactive()
✔ Hour heat-map → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\hour_heatmap.html
→ cox_coeff_bar_ci()
✔ Cox bar