In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import annotations

import pathlib
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

try:
    import community as community_louvain
except ImportError:
    community_louvain = None

try:
    from lifelines import CoxPHFitter, KaplanMeierFitter
except ImportError:
    KaplanMeierFitter = CoxPHFitter = None

try:
    from rapidfuzz.distance import Levenshtein
except ImportError:
    Levenshtein = None

In [3]:
ROOT = pathlib.Path(".").resolve().parent
GRAPHS_DIR = ROOT / "graphs"
PROC_DIR = ROOT / "processed"
FIG_DIR = ROOT / "figures" / "temporal"
HTML_DIR = FIG_DIR / "html"

FIG_DIR.mkdir(exist_ok=True)
HTML_DIR.mkdir(exist_ok=True)

PLOTLY_TEMPL = "plotly_white"

In [4]:
def load_graph(name: str) -> nx.Graph:
    """Load a GraphML file and return a NetworkX graph."""
    path = GRAPHS_DIR / f"{name}.graphml"
    return nx.read_graphml(path)


def add_cluster_attribute(G: nx.Graph, resolution: float = 1.0, attr: str = "cluster"):
    """Add Louvain community IDs as a node attribute."""
    if community_louvain is None:
        raise ImportError("python-louvain is not installed in this environment.")
    partition = community_louvain.best_partition(nx.Graph(G), resolution=resolution)
    nx.set_node_attributes(G, partition, attr)
    return partition


def load_hops() -> pd.DataFrame:
    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet").sort_values(
        ["image_id", "unixtime"]
    )
    df["next_time"] = df.groupby("image_id").unixtime.shift(-1)
    df["next_sub"] = df.groupby("image_id").subreddit.shift(-1)
    hops = df.dropna(subset=["next_time"]).loc[df.subreddit != df.next_sub].copy()
    hops["gap_h"] = (hops.next_time - hops.unixtime) / 3600.0
    first_year = pd.to_datetime(
        df.groupby("image_id").unixtime.transform("first"), unit="s"
    ).dt.year
    hops["first_year"] = first_year
    hops["gap_idx"] = hops.groupby("image_id").cumcount() + 1
    hops["timestamp"] = pd.to_datetime(hops.next_time, unit="s")
    return hops


In [5]:
def latency_hist(plot_out: pathlib.Path = FIG_DIR / "latency_hist.png"):
    G = load_graph("latency_flow")
    gaps = [d["median_gap_h"] for _, _, d in G.edges(data=True)]
    plt.figure(figsize=(6, 4))
    sns.histplot(gaps, bins=60, log_scale=(False, True))
    plt.xlabel("Median repost gap (h)")
    plt.ylabel("# edges (log)")
    plt.title("Distribution of median repost gaps")
    plt.tight_layout()
    plt.savefig(plot_out, dpi=150)
    plt.close()
    print(f"✔ Latency hist → {plot_out}")


def speed_carpet(html_out: pathlib.Path = HTML_DIR / "speed_carpet.html"):
    G = load_graph("latency_flow")
    add_cluster_attribute(G)
    df = pd.DataFrame(
        [
            {
                "src": u,
                "dst": v,
                "speed": d["speed"],
                "src_cl": G.nodes[u]["cluster"],
                "dst_cl": G.nodes[v]["cluster"],
            }
            for u, v, d in G.edges(data=True)
        ]
    )
    df_piv = df.pivot(index="src", columns="dst", values="speed")
    fig = px.imshow(
        np.log10(df_piv.fillna(1e-6)),
        aspect="auto",
        color_continuous_scale="Turbo",
        labels=dict(color="log10(speed)"),
        template=PLOTLY_TEMPL,
        height=900,
        title="Edge speed carpet (log10)",
    )
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    fig.write_html(html_out)
    print(f"✔ Speed carpet → {html_out}")


def violin_latency_hubs(
    plot_out: pathlib.Path = FIG_DIR / "violin_latency_hubs.png", top_n: int = 10
):
    G = load_graph("latency_flow")
    hubs = sorted(G.in_degree(weight="n_hops"), key=lambda kv: kv[1], reverse=True)[
        :top_n
    ]
    hub_set = {n for n, _ in hubs}
    data = [
        {"hub": v, "gap": d["median_gap_h"]}
        for u, v, d in G.edges(data=True)
        if v in hub_set
    ]
    df = pd.DataFrame(data)
    plt.figure(figsize=(8, 4))
    sns.violinplot(data=df, x="hub", y="gap", scale="width", inner="quartile")
    plt.yscale("log")
    plt.xticks(rotation=45, ha="right")
    plt.title("Median gap distribution for revival hubs")
    plt.tight_layout()
    plt.savefig(plot_out, dpi=150)
    plt.close()
    print(f"✔ Violin → {plot_out}")


def survival_curves_speed_tercile(
    plot_out: pathlib.Path = FIG_DIR / "survival_speed.png",
):
    if KaplanMeierFitter is None:
        print("⚠ lifelines not installed – survival curves skipped")
        return
    G = load_graph("latency_flow")
    speed_vals = [d["speed"] for _, _, d in G.edges(data=True)]
    q1, q2 = np.quantile(speed_vals, [1 / 3, 2 / 3])
    speed_cls = {
        (u, v): ("slow" if s <= q1 else "fast" if s >= q2 else "mid")
        for u, v, s in [(u, v, d["speed"]) for u, v, d in G.edges(data=True)]
    }

    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet")
    df = df.sort_values(["image_id", "unixtime"])
    df["next_time"] = df.groupby("image_id")["unixtime"].shift(-1)
    df["next_sub"] = df.groupby("image_id")["subreddit"].shift(-1)
    hops = df.dropna(subset=["next_time"]).loc[df.subreddit != df.next_sub]
    hops["gap_h"] = (hops.next_time - hops.unixtime) / 3600.0
    hops["edge"] = list(zip(hops.subreddit, hops.next_sub))
    hops["speed_class"] = hops.edge.map(speed_cls)
    hops = hops.dropna(subset=["speed_class"])  # drop hops not in latency graph

    kmf = KaplanMeierFitter()
    plt.figure(figsize=(6, 4))
    for grp, dfg in hops.groupby("speed_class"):
        kmf.fit(durations=dfg.gap_h, event_observed=np.ones(len(dfg)), label=grp)
        kmf.plot(ci_show=False)
    plt.xscale("log")
    plt.xlabel("Hours (log)")
    plt.ylabel("Survival probability")
    plt.title("Kaplan–Meier survival by edge‑speed tercile")
    plt.tight_layout()
    plt.savefig(plot_out, dpi=150)
    plt.close()
    print(f"✔ Survival curves → {plot_out}")


def cox_hazard_model(
    plot_out: pathlib.Path = FIG_DIR / "cox_coeffs.png",
    table_out: pathlib.Path = FIG_DIR / "cox_summary.csv",
):
    if CoxPHFitter is None or Levenshtein is None:
        print("⚠ lifelines or rapidfuzz missing – Cox model skipped")
        return
    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet")
    df = df.sort_values(["image_id", "unixtime"])
    df["next_time"] = df.groupby("image_id")["unixtime"].shift(-1)
    df["next_score"] = df.groupby("image_id")["score"].shift(-1)
    df["next_title"] = df.groupby("image_id")["title"].shift(-1)
    df["next_sub"] = df.groupby("image_id")["subreddit"].shift(-1)
    hops = df.dropna(subset=["next_time"]).loc[df.subreddit != df.next_sub]
    hops["duration"] = (hops.next_time - hops.unixtime) / 3600.0
    hops["event"] = 1  # always observed
    # covariates
    speed_lookup = {
        (u, v): d["speed"] for u, v, d in load_graph("latency_flow").edges(data=True)
    }
    hops["speed"] = list(
        map(
            lambda x: speed_lookup.get((x[0], x[1]), np.nan),
            zip(hops.subreddit, hops.next_sub),
        )
    )
    hops = hops.dropna(subset=["speed"])
    hops["title_dist"] = hops.apply(
        lambda r: Levenshtein.distance(r.title, r.next_title), axis=1
    )
    hops["init_score"] = hops.score
    cox_df = hops[["duration", "event", "speed", "title_dist", "init_score"]].copy()
    cph = CoxPHFitter()
    cph.fit(cox_df, duration_col="duration", event_col="event")
    coef = cph.params_
    coef.to_csv(table_out)
    # barplot
    plt.figure(figsize=(5, 3))
    coef.sort_values().plot(kind="barh")
    plt.xlabel("Coefficient (log‑HR)")
    plt.title("Cox model coefficients")
    plt.tight_layout()
    plt.savefig(plot_out, dpi=150)
    plt.close()
    print(f"✔ Cox model → {plot_out}  (coeff table → {table_out})")


def half_life_trend_line(out=FIG_DIR / "half_life_trend.png"):
    hops = load_hops()
    mid = hops.groupby("image_id").gap_h.median().reset_index(name="med")
    yrs = hops.groupby("image_id").first_year.first().reset_index()
    df = mid.merge(yrs)
    trend = df.groupby("first_year").med.median().reset_index()
    plt.figure(figsize=(6, 4))
    sns.lineplot(data=trend, x="first_year", y="med", marker="o")
    plt.yscale("log")
    plt.xlabel("First‑year cohort")
    plt.ylabel("Median gap (h, log)")
    plt.tight_layout()
    plt.savefig(out, dpi=150)
    plt.close()


def gap_index_boxplot(out=FIG_DIR / "gap_index_boxplot.png", mx=5):
    hops = load_hops()
    flt = hops[hops.gap_idx <= mx]
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=flt, x="gap_idx", y="gap_h")
    plt.yscale("log")
    plt.xlabel("Repost index")
    plt.ylabel("Gap (h, log)")
    plt.tight_layout()
    plt.savefig(out, dpi=150)
    plt.close()


def survival_curves_cohort_year(out=FIG_DIR / "survival_cohort.png"):
    if KaplanMeierFitter is None:
        print("ℹ️  lifelines not installed – survival by cohort skipped")
        return
    hops = load_hops()
    km = KaplanMeierFitter()
    plt.figure(figsize=(6, 4))
    for y, dfy in hops.groupby("first_year"):
        km.fit(durations=dfy.gap_h, event_observed=np.ones(len(dfy)), label=str(int(y)))
        km.plot(ci_show=False)
    plt.xscale("log")
    plt.xlabel("Hours (log)")
    plt.ylabel("Survival")
    plt.tight_layout()
    plt.savefig(out, dpi=150)
    plt.close()


def rolling_median_gap_series(out=HTML_DIR / "rolling_gap.html", window="30D"):
    hops = load_hops().set_index("timestamp").sort_index()
    med = hops.gap_h.rolling(window).median().dropna()
    px.line(
        med,
        title=f"Rolling {window} median repost gap",
        template=PLOTLY_TEMPL,
        height=600,
    ).update_yaxes(type="log").write_html(out)


def hourly_gap_heatmap(out=FIG_DIR / "hourly_gap_heatmap.png"):
    hops = load_hops()
    hops["src_hr"] = pd.to_datetime(hops.unixtime, unit="s").dt.hour
    hops["dst_hr"] = hops.timestamp.dt.hour
    heat = hops.pivot_table(
        values="gap_h", index="src_hr", columns="dst_hr", aggfunc="median"
    )
    plt.figure(figsize=(7, 6))
    sns.heatmap(
        np.log10(heat + 1e-3), cmap="viridis", cbar_kws={"label": "log10(median gap h)"}
    )
    plt.xlabel("Dest post hour")
    plt.ylabel("Source post hour")
    plt.tight_layout()
    plt.savefig(out, dpi=150)
    plt.close()


def monthly_resubmission_volume(out=HTML_DIR / "monthly_resubs.html"):
    hops = load_hops()
    hops["month"] = hops.timestamp.dt.to_period("M").dt.to_timestamp()
    vol = hops.groupby("month").size().reset_index(name="resubs")
    px.bar(
        vol,
        x="month",
        y="resubs",
        title="Monthly resubmission volume",
        template=PLOTLY_TEMPL,
        height=600,
    ).write_html(out)


def gap_distribution_by_year_violin(out=FIG_DIR / "gap_year_violin.png"):
    hops = load_hops()
    hops["year"] = hops.timestamp.dt.year
    plt.figure(figsize=(8, 4))
    sns.violinplot(data=hops, x="year", y="gap_h", scale="width", inner="quartile")
    plt.yscale("log")
    plt.xlabel("Year")
    plt.ylabel("Gap (h, log)")
    plt.tight_layout()
    plt.savefig(out, dpi=150)
    plt.close()

In [6]:
def exporter_bar_race(
    html_out: pathlib.Path = HTML_DIR / "bar_race.html", *, period: str = "M"
):
    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet")
    df = df.sort_values(["image_id", "unixtime"])
    df["gain"] = df.groupby("image_id")["score"].diff()
    df = df.dropna(subset=["gain"])
    df["period"] = (
        pd.to_datetime(df.unixtime, unit="s").dt.to_period(period).dt.to_timestamp()
    )

    gains = df.groupby(["period", "subreddit"])["gain"].mean().reset_index()
    gains["cum_gain"] = gains.groupby("subreddit")["gain"].cumsum()

    fig = px.bar(
        gains,
        x="cum_gain",
        y="subreddit",
        animation_frame="period",
        orientation="h",
        range_x=[0, gains.cum_gain.max() * 1.05],
        template=PLOTLY_TEMPL,
        height=700,
        title="Cumulative avg karma exported per subreddit",
    )
    fig.update_layout(
        yaxis={"categoryorder": "total ascending"},
        transition={"duration": 500},
        frame={"duration": 500},
    )
    fig.write_html(html_out)
    print(f"✔ Bar‑race → {html_out}")


In [7]:
def edge_rank_bump_chart(
    html_out: pathlib.Path = HTML_DIR / "edge_bump.html",
    *,
    top_k: int = 50,
    min_years: int = 2,
    palette: list[str] | None = None,
):
    # Load & prepare data
    df = pd.read_parquet(PROC_DIR / "submissions_final.parquet").sort_values(
        ["image_id", "unixtime"]
    )
    df["gain"] = df.groupby("image_id")["score"].diff()
    df = df.dropna(subset=["gain"])
    df["year"] = pd.to_datetime(df.unixtime, unit="s").dt.year

    # Median gain by (year, subreddit)
    g = df.groupby(["year", "subreddit"])["gain"].median().reset_index()
    g["rank"] = g.groupby("year")["gain"].rank(method="min", ascending=False)

    # Filter by top_k and frequency
    eligible = g.loc[g["rank"] <= top_k, "subreddit"].value_counts()
    keep_subs = eligible[eligible >= min_years].index
    g = g[g["subreddit"].isin(keep_subs) & (g["rank"] <= top_k)]

    # Colour mapping
    if palette is None:
        palette = px.colors.qualitative.Plotly
    colour_map = {
        sub: palette[i % len(palette)] for i, sub in enumerate(sorted(keep_subs))
    }

    # Build figure
    fig = go.Figure()
    for sub in sorted(keep_subs):
        d = g[g["subreddit"] == sub].sort_values("year")
        fig.add_trace(
            go.Scatter(
                x=d["year"],
                y=d["rank"],
                mode="lines+markers",
                line=dict(width=2, color=colour_map[sub]),
                marker=dict(size=6),
                name=sub,
                hovertemplate=(
                    f"<b>{sub}</b>"  # the bit you want to keep
                    "<br>year %{x}: rank %{y}"
                    "<br>median gain %{customdata:+,.0f}"
                    "<extra></extra>"  # <-- suppress the duplicate on the right
                ),
                customdata=d["gain"].values,
            )
        )

    fig.update_yaxes(autorange="reversed", title="Rank (1 = highest median gain)")
    fig.update_xaxes(title="Year", dtick=1)
    fig.update_layout(
        title="Median‑gain rank evolution of leading subreddits",
        height=600,
        template="plotly_white",
        legend_title="Subreddit",
        margin=dict(l=60, r=20, t=60, b=40),
    )

    # Export
    fig.write_html(str(html_out), include_plotlyjs="cdn")
    print(f"✔ Bump chart → {html_out}")


In [8]:
ALL_FUNCS = [
    latency_hist,
    speed_carpet,
    violin_latency_hubs,
    survival_curves_speed_tercile,
    cox_hazard_model,
    half_life_trend_line,
    gap_index_boxplot,
    survival_curves_cohort_year,
    rolling_median_gap_series,
    hourly_gap_heatmap,
    monthly_resubmission_volume,
    gap_distribution_by_year_violin,
    exporter_bar_race,
    edge_rank_bump_chart,
]


def run_all():
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        start = datetime.now()
        for fn in ALL_FUNCS:
            fname = fn.__name__
            try:
                print(f"→ {fname}()")
                if fname == "sankey_top_images":
                    fn(n_images=10)
                else:
                    fn()
            except Exception as e:
                print(f"⚠ {fname} failed: {e}")
        print(
            f"Completed in {datetime.now() - start} – outputs in {FIG_DIR} & {HTML_DIR}"
        )


In [9]:
run_all()

→ latency_hist()
✔ Latency hist → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\latency_hist.png
→ speed_carpet()
✔ Speed carpet → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\html\speed_carpet.html
→ violin_latency_hubs()
✔ Violin → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\violin_latency_hubs.png
→ survival_curves_speed_tercile()
✔ Survival curves → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\survival_speed.png
→ cox_hazard_model()
✔ Cox model → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\cox_coeffs.png  (coeff table → C:\Users\balsr\OneDrive\Desktop\RedditDataAnalysis\figures\temporal\cox_summary.csv)
→ half_life_trend_line()
→ gap_index_boxplot()
→ survival_curves_cohort_year()
→ rolling_median_gap_series()
→ hourly_gap_heatmap()
→ monthly_resubmission_volume()
→ gap_distribution_by_year_violin()
→ exporter_bar_race()
⚠ exporter_bar_race failed: Invalid property sp