In [13]:
import pandas as pd
import numpy as np
import os
from datetime import date
today = date.today()
path = os.path.dirname(os.getcwd())
print(f'📂 Current working directory: {path}')
print(f'💚 Today is {today}')
import sys
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'scripts'))

import graph
import taxonomy
import data_loads
import figure_build

# Plotting
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

📂 Current working directory: /Users/serenekim/Desktop/PhD/meta-wealth_mobility
💚 Today is 2025-12-23


In [2]:
period_order = taxonomy.period_order
period_labels = taxonomy.period_labels
cm_to_px = graph.cm_to_px

# Global Setting

In [3]:
# Visualization settings ----------------------------------
template_type = "simple_white"
showlegend = False
dotsize = 4 # 20 when exporting it to html
other_opacity = 0.3

# Define color palette ---------------------------------------------------------------------
node_palette = {"R4": "#008A69",
              "M1": "#1964B0",
              "R2": "#E9DC6D",
              "D13":"#DB5829",
              "D1": "#882D71",
            #   "Other": "#DEDEDE"
            "Other": "#808080"
             }

pair_palette = {
    "M1-R4": "#008A69",
    "M1-R2": "#1964B0",
    "M1-D1": "#E9DC6D",
    "D1-R4": "#DB5829",
    "D1-R2": "#882D71",
    # "Other": "#DEDEDE"
    "Other": "#808080"
}

triangle_palette = {
    "D1-M1-R2": "#008A69",
    "D1-M1-R4": "#1964B0",
    "D6-R4-M1": "#E9DC6D",
    "D3-M1-R2": "#DB5829",
    "D3-M1-R4": "#882D71",
    # "Other": "#DEDEDE"
    "Other": "#808080"
}

In [4]:
# Define point offsets for jittering ---------------------------------------------
point_offsets_nodes = {
    "R4": -1,
    "M1": -0.5,
    "R2": 0.0,
    "D13": 0.5,
    "D1": 1
}
point_offsets_pairs= {
    "M1-R4": -1,
    "M1-R2": -0.5,
    "M1-D1": 0.0,
    "D1-R4": 0.5,
    "D1-R2": 1,
}

point_offsets_triplet = {
    "D1-M1-R2": -1,
    "D1-M1-R4": -0.5,
    "D6-R4-M1": 0.0,
    "D3-M1-R4": 0.5,
    "D3-M1-R2": 1,
}


# Fig 2. Descriptives

In [5]:
import plotly.graph_objects as go


# Load Datasets--------------------------------------------
# Just in case it is modified later :-)
deg_df = data_loads.all_top_degree_df.copy()
pair_df = data_loads.pairs.copy()
tri_df = data_loads.tri.copy()
strength_df = data_loads.all_top_strength_df.copy()


# Take only one paper per period for degree and strength (to avoid overplotting)
deg_df = deg_df.drop_duplicates(subset=['short_name', 'period'])
pair_df = pair_df.drop_duplicates(subset=['short_name', 'period'])
tri_df = tri_df.drop_duplicates(subset=['short_name', 'period'])


deg_df["period"] = pd.Categorical(deg_df["period"], categories=period_order, ordered=True)


# Create subplots ----------------------------------------------------------------
subplots = sp.make_subplots(rows=1, cols=3, column_widths=[0.33, 0.33, 0.33], horizontal_spacing=0.1,)


# --- "Other" nodes first (no jitter) ---
mask_other = strength_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=strength_df.loc[mask_other, "period"],
    y=strength_df.loc[mask_other, "strength"] / 2,
    boxpoints='all',
    jitter=0.3,
    pointpos=0,
    whiskerwidth=0,
    line=dict(width=0),
    fillcolor='rgba(0,0,0,0)',
    marker=dict(color=node_palette["Other"], 
                opacity=other_opacity, size=dotsize),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Node: %{customdata[0]}<br>"
        "Kind: %{customdata[1]}<br>"
        "Strength: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        strength_df.loc[mask_other, "short_name"],
        strength_df.loc[mask_other, "kind"],
        strength_df.loc[mask_other, "strength"]
    ), axis=-1),
    showlegend=True
), row=1, col=1)

# --- Highlighted groups with jitter ---

for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = strength_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=strength_df.loc[mask, "period"],
        y=strength_df.loc[mask, "strength"] / 2,
        boxpoints='all',
        jitter=0.7,          # horizontal jitter
        pointpos=point_offsets_nodes[label],          # centered at category
        whiskerwidth=0,      # no whiskers
        line=dict(width=0),  # no box outline
        fillcolor='rgba(0,0,0,0)',  # transparent background
        marker=dict(
            color=node_palette[label],
            opacity=1.0,
            size=dotsize
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Node: %{customdata[0]}<br>"
            "Kind: %{customdata[1]}<br>"
            "Strength: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            strength_df.loc[mask, "short_name"],
            strength_df.loc[mask, "kind"],
            strength_df.loc[mask, "strength"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=1)

# Pairs

# --- "Other" pairs ---
mask_other = pair_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=pair_df.loc[mask_other, "period"],
    y=pair_df.loc[mask_other, "weight_sum"],
    boxpoints='all',
    jitter=0.3,            # no horizontal jitter for "Other"
    pointpos=0,          # centered at category
    whiskerwidth=0,      # no whiskers
    line=dict(width=0),  # no box outline
    fillcolor='rgba(0,0,0,0)',  # transparent background
    marker=dict(
        color=pair_palette["Other"],
        opacity=other_opacity,
        size=dotsize,
        symbol='pentagon'
    ),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Pair: %{customdata[0]}<br>"
        "Count: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        pair_df.loc[mask_other, "short_name"],
        pair_df.loc[mask_other, "weight_sum"]
    ), axis=-1),
    showlegend=True
), row=1, col=2)

# --- Highlighted pairs (jittered) ---
for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = pair_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=pair_df.loc[mask, "period"],
        y=pair_df.loc[mask, "weight_sum"],
        boxpoints='all',
        jitter=0.7,
        pointpos=point_offsets_pairs[label],
        whiskerwidth=0,
        line=dict(width=0),
        fillcolor='rgba(0,0,0,0)',
        marker=dict(
            color=pair_palette[label],
            opacity=1.0,
            size=dotsize,
            symbol='pentagon'
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Pair: %{customdata[0]}<br>"
            "Count: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            pair_df.loc[mask, "short_name"],
            pair_df.loc[mask, "weight_sum"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=2)


# Triangles
# --- "Other" triangles ---
mask_other = tri_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=tri_df.loc[mask_other, "period"],
    y=tri_df.loc[mask_other, "n_papers"],
    boxpoints='all',
    jitter=0.3,            # no horizontal jitter for "Other"
    pointpos=0,          # centered at category
    whiskerwidth=0,      # no whiskers
    line=dict(width=0),  # no box outline
    fillcolor='rgba(0,0,0,0)',  # transparent background
    marker=dict(
        color=triangle_palette["Other"],
        opacity=other_opacity,
        size=dotsize,
        symbol='triangle-up'
    ),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Triangle: %{customdata[0]}<br>"
        "Count: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        tri_df.loc[mask_other, "short_name"],
        tri_df.loc[mask_other, "n_papers"]
    ), axis=-1),
    showlegend=True
), row=1, col=3)

# --- Highlighted triangles (jittered) ---

for label in sorted(taxonomy.TRIANGLE_HIGHLIGHTS.values()):
    mask = tri_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=tri_df.loc[mask, "period"],
        y=tri_df.loc[mask, "n_papers"],
        boxpoints="all",
        jitter=0.7,
        pointpos=point_offsets_triplet[label],
        whiskerwidth=0,
        line=dict(width=0),
        fillcolor='rgba(0,0,0,0)',
        marker=dict(
            color=triangle_palette[label],
            opacity=1.0,
            size=dotsize,
            symbol='triangle-up'
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Triangle: %{customdata[0]}<br>"
            "Count: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            tri_df.loc[mask, "short_name"],
            tri_df.loc[mask, "n_papers"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=3)



strength_kendall_results = figure_build.add_kendall_overlay(
    subplots, strength_df, value_col="strength", period_col="period",
    row=1, col=1, period_order=period_order, overlay_idx=10, show_overlay_x=True, show_overlay_y=False
)

pair_kendall_results = figure_build.add_kendall_overlay(
    subplots, pair_df, value_col="weight_sum", period_col="period",
    row=1, col=2, period_order=period_order, overlay_idx=11, show_overlay_x=True, show_overlay_y=False
)

triangle_kendall_results = figure_build.add_kendall_overlay(
    subplots, tri_df, value_col="n_papers", period_col="period",
    row=1, col=3, period_order=period_order, overlay_idx=12, show_overlay_x=True, show_overlay_y=True
)


subplots.update_xaxes(categoryorder="array", categoryarray=period_order,   tickvals=list(range(len(period_order))),ticktext=[period_labels[p] for p in period_order], row=1)

subplots.update_yaxes(row=1, col=1, title_text="Node Count", title_standoff=1)
subplots.update_yaxes(row=1, col=2, title_text="Pair Count", title_standoff=1)
subplots.update_yaxes(row=1, col=3, title_text="Triangle Count", title_standoff=1)

subplots.add_annotation(
    text="(a)",
    xref="paper", yref="paper",
    x=-0.03, y=1.02,
    showarrow=False,
    font=dict(size=8, color="black"),
    xanchor="left",
    align="left"
)
subplots.add_annotation(
    text="(b)",
    xref="paper", yref="paper",
    x=0.334433, y=1.02,
    showarrow=False,
    font=dict(size=8, color="black"),
    xanchor="left",
    align="left"
)
subplots.add_annotation(
    text="(c)",
    xref="paper", yref="paper",
    x=0.7000009, y=1.02,
    showarrow=False,
    font=dict(size=8, color="black"),
    xanchor="left",
    align="left"
)

subplots.update_layout(
    template=template_type,
    showlegend=showlegend,
    xaxis=dict(categoryorder="array", categoryarray=period_order),
    font=dict(family="Arial", size=7),
    # title_text="Degree distribution per period"
    margin={'t':2,'l':10,'b':2,'r':10}
)


In [7]:
# subplots.write_html(f"{path}/results/feature-only-KG/img/fig2.html",
#                     include_plotlyjs='cdn',
#                     full_html=True,
#                     auto_open=False)


# subplots.write_image(f"{path}/results/feature-only-KG/img/fig2_2_jitter_opacity{other_opacity}.pdf",
#                      width=cm_to_px(18), height=cm_to_px(6))
subplots.write_image(f"{path}/results/feature-only-KG/img/fig2_2_jitter.svg",
                     width=cm_to_px(18), height=cm_to_px(6))

# Fig S4. Random Model - Descriptives

In [6]:
rn_node_df = pd.read_csv(f"{path}/results/feature-only-KG/random_graph_null_node_centrality_count_summary_two-sided_1000.csv")
rn_pair_df = pd.read_csv(f"{path}/results/feature-only-KG/random_graph_null_edge_centrality_count_summary_two-sided_1000.csv")
rn_tri_df = pd.read_csv(f"{path}/results/feature-only-KG/random_graph_null_triangle_count_summary_two-sided_1000.csv")
rn_summary = pd.read_csv(f"{path}/results/feature-only-KG/random_graph_null_kendall_tau_summary_two-sided_1000_CI.csv")

In [7]:
import plotly.graph_objects as go
import plotly.subplots as sp

# Load Datasets--------------------------------------------
rn_node_df = rn_node_df.copy()
rn_pair_df = rn_pair_df.copy()
rn_tri_df = rn_tri_df.copy()


rn_node_df["node"] = rn_node_df["node_name"].apply(graph.norm)
rn_pair_df["from_name"] = rn_pair_df["u_name"].apply(graph.norm)
rn_pair_df["to_name"] = rn_pair_df["v_name"].apply(graph.norm)
rn_tri_df["n1_name"] = rn_tri_df["n1_name"].apply(graph.norm)
rn_tri_df["n2_name"] = rn_tri_df["n2_name"].apply(graph.norm)
rn_tri_df["n3_name"] = rn_tri_df["n3_name"].apply(graph.norm)
rn_pair_df["names"] = (rn_pair_df["from_name"] + "-" + rn_pair_df["to_name"]).apply(graph.norm)
rn_tri_df["names"] = ("[" + rn_tri_df["n1_name"] + ", " + rn_tri_df["n2_name"] + ", " + rn_tri_df["n3_name"] + "]").apply(graph.norm)

rn_node_df["color_group"] = rn_node_df["node"].map(taxonomy.NODE_HIGHLIGHTS).fillna("Other")
rn_pair_df["color_group"] = rn_pair_df["names"].map(taxonomy.PAIR_HIGHLIGHTS).fillna("Other")
rn_tri_df["color_group"] = rn_tri_df["names"].map(taxonomy.TRIANGLE_HIGHLIGHTS).fillna("Other")


# Take only one paper per period for degree and strength (to avoid overplotting)
rn_node_df = rn_node_df.drop_duplicates(subset=['node', 'period'])
rn_pair_df = rn_pair_df.drop_duplicates(subset=['names', 'period'])
rn_tri_df = rn_tri_df.drop_duplicates(subset=['names', 'period'])

rn_node_df["period"] = pd.Categorical(rn_node_df["period"], categories=period_order, ordered=True)


# Normalization --------------------------------------------------
max_node_count = rn_node_df["mean_null_node_count"].max()
rn_node_df["mean_null_node_count"] = rn_node_df["mean_null_node_count"] / max_node_count


# Create subplots ----------------------------------------------------------------
subplots = sp.make_subplots(rows=1, cols=3, column_widths=[0.33, 0.33, 0.33], horizontal_spacing=0.1,)


# --- "Other" nodes first (no jitter) ---
mask_other = rn_node_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=rn_node_df.loc[mask_other, "period"],
    y=rn_node_df.loc[mask_other, "mean_null_node_count"],
    boxpoints='all',
    jitter=0.3,
    pointpos=0,
    whiskerwidth=0,
    line=dict(width=0),
    fillcolor='rgba(0,0,0,0)',
    marker=dict(color=node_palette["Other"], 
                opacity=other_opacity, size=dotsize),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Node: %{customdata[0]}<br>"
        "Kind: %{customdata[1]}<br>"
        "Count: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        rn_node_df.loc[mask_other, "node"],
        rn_node_df.loc[mask_other, "node_type"],
        rn_node_df.loc[mask_other, "mean_null_node_count"]
    ), axis=-1),
    showlegend=True
), row=1, col=1)

# --- Highlighted groups with jitter ---

for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = rn_node_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=rn_node_df.loc[mask, "period"],
        y=rn_node_df.loc[mask, "mean_null_node_count"],
        boxpoints='all',
        jitter=0.7,          # horizontal jitter
        pointpos=point_offsets_nodes[label],          # centered at category
        whiskerwidth=0,      # no whiskers
        line=dict(width=0),  # no box outline
        fillcolor='rgba(0,0,0,0)',  # transparent background
        marker=dict(
            color=node_palette[label],
            opacity=1.0,
            size=dotsize
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Node: %{customdata[0]}<br>"
            "Kind: %{customdata[1]}<br>"
            "Count: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            rn_node_df.loc[mask, "node"],
            rn_node_df.loc[mask, "node_type"],
            rn_node_df.loc[mask, "mean_null_node_count"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=1)

# Pairs

# --- "Other" pairs ---
mask_other = rn_pair_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=rn_pair_df.loc[mask_other, "period"],
    y=rn_pair_df.loc[mask_other, "mean_null_pair_count"],
    boxpoints='all',
    jitter=0.3,            # no horizontal jitter for "Other"
    pointpos=0,          # centered at category
    whiskerwidth=0,      # no whiskers
    line=dict(width=0),  # no box outline
    fillcolor='rgba(0,0,0,0)',  # transparent background
    marker=dict(
        color=pair_palette["Other"],
        opacity=other_opacity,
        size=dotsize,
        symbol='pentagon'
    ),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Pair: %{customdata[0]}<br>"
        "Count: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        rn_pair_df.loc[mask_other, "names"],
        rn_pair_df.loc[mask_other, "mean_null_pair_count"]
    ), axis=-1),
    showlegend=True
), row=1, col=2)

# --- Highlighted pairs (jittered) ---
for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = rn_pair_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=rn_pair_df.loc[mask, "period"],
        y=rn_pair_df.loc[mask, "mean_null_pair_count"],
        boxpoints='all',
        jitter=0.7,
        pointpos=point_offsets_pairs[label],
        whiskerwidth=0,
        line=dict(width=0),
        fillcolor='rgba(0,0,0,0)',
        marker=dict(
            color=pair_palette[label],
            opacity=1.0,
            size=dotsize,
            symbol='pentagon'
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Pair: %{customdata[0]}<br>"
            "Count: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            rn_pair_df.loc[mask, "names"],
            rn_pair_df.loc[mask, "mean_null_pair_count"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=2)


# Triangles
# --- "Other" triangles ---
mask_other = rn_tri_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=rn_tri_df.loc[mask_other, "period"],
    y=rn_tri_df.loc[mask_other, "mean_null_triangle_count"],
    boxpoints='all',
    jitter=0.3,            # no horizontal jitter for "Other"
    pointpos=0,          # centered at category
    whiskerwidth=0,      # no whiskers
    line=dict(width=0),  # no box outline
    fillcolor='rgba(0,0,0,0)',  # transparent background
    marker=dict(
        color=triangle_palette["Other"],
        opacity=other_opacity,
        size=dotsize,
        symbol='triangle-up'
    ),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Triangle: %{customdata[0]}<br>"
        "Count: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        rn_tri_df.loc[mask_other, "names"],
        rn_tri_df.loc[mask_other, "mean_null_triangle_count"]
    ), axis=-1),
    showlegend=True
), row=1, col=3)

# --- Highlighted triangles (jittered) ---

for label in sorted(taxonomy.TRIANGLE_HIGHLIGHTS.values()):
    mask = rn_tri_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=rn_tri_df.loc[mask, "period"],
        y=rn_tri_df.loc[mask, "mean_null_triangle_count"],
        boxpoints="all",
        jitter=0.7,
        pointpos=point_offsets_triplet[label],
        whiskerwidth=0,
        line=dict(width=0),
        fillcolor='rgba(0,0,0,0)',
        marker=dict(
            color=triangle_palette[label],
            opacity=1.0,
            size=dotsize,
            symbol='triangle-up'
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Triangle: %{customdata[0]}<br>"
            "Count: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            rn_tri_df.loc[mask, "names"],
            rn_tri_df.loc[mask, "mean_null_triangle_count"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=3)

        
rng_boot = np.random.default_rng(42)

global_node_stats = figure_build.bootstrap_global_mean_ci(
    rn_summary.loc[rn_summary["metric"] == "node_count", "tau_mean"].values,
    n_boot=5000,
    conf=0.95,
    rng=rng_boot,
)

global_pair_stats = figure_build.bootstrap_global_mean_ci(
    rn_summary.loc[rn_summary["metric"] == "pair_count", "tau_mean"].values,
    n_boot=5000,
    conf=0.95,
    rng=rng_boot,
)

global_tri_stats = figure_build.bootstrap_global_mean_ci(
    rn_summary.loc[rn_summary["metric"] == "triangle_count", "tau_mean"].values,
    n_boot=5000,
    conf=0.95,
    rng=rng_boot,
)

kendall_node_count = figure_build.add_kendall_overlay_no_compute(rn_summary, "node_count", subplots, 1, 1, period_order, 10, True, False, True, global_node_stats)
kendall_pair_count = figure_build.add_kendall_overlay_no_compute(rn_summary, "pair_count", subplots, 1, 2, period_order, 11, True, False, True, global_pair_stats)
kendall_triangle_count = figure_build.add_kendall_overlay_no_compute(rn_summary, "triangle_count", subplots, 1, 3, period_order, 12, True, True, True, global_tri_stats)

subplots.update_xaxes(categoryorder="array", categoryarray=period_order,   tickvals=list(range(len(period_order))), ticktext=[period_labels[p] for p in period_order], row=1)

subplots.update_yaxes(row=1, col=1, title_text="Node Count (Random Model)", title_standoff=1)
subplots.update_yaxes(row=1, col=2, title_text="Pair Count (Random Model)", title_standoff=1)
subplots.update_yaxes(row=1, col=3, title_text="Triangle Count (Random Model)", title_standoff=1)

subplots.update_layout(
    template=template_type,
    showlegend=showlegend,
    xaxis=dict(categoryorder="array", categoryarray=period_order),
    font=dict(family="Arial", size=7),
    # title_text="Degree distribution per period"
    margin={'t':2,'l':10,'b':2,'r':10}
)


In [18]:
subplots.write_image(f"{path}/results/feature-only-KG/img/fig2_2_random_model.svg", 
                     width=cm_to_px(18), height=cm_to_px(6))

Perhaps it's more correct to use the kendall's tau from the MC (R=1000) + MC p-value rather than the bootstrap tau + 95% CI. 

# Fig S5. Random Model - Betweenness

In [8]:
import plotly.graph_objects as go
import plotly.subplots as sp

# Create subplots ----------------------------------------------------------------
subplots = sp.make_subplots(rows=1, cols=2, column_widths=[0.5, 0.5], horizontal_spacing=0.1,)

# --- "Other" nodes first (no jitter) ---
mask_other = rn_node_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=rn_node_df.loc[mask_other, "period"],
    y=rn_node_df.loc[mask_other, "mean_null_betweenness"],
    boxpoints='all',
    jitter=0.3,
    pointpos=0,
    whiskerwidth=0,
    line=dict(width=0),
    fillcolor='rgba(0,0,0,0)',
    marker=dict(color=node_palette["Other"], 
                opacity=other_opacity, size=dotsize),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Node: %{customdata[0]}<br>"
        "Kind: %{customdata[1]}<br>"
        "Betweenness: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        rn_node_df.loc[mask_other, "node"],
        rn_node_df.loc[mask_other, "node_type"],
        rn_node_df.loc[mask_other, "mean_null_betweenness"]
    ), axis=-1),
    showlegend=True
), row=1, col=1)

# --- Highlighted groups with jitter ---

for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = rn_node_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=rn_node_df.loc[mask, "period"],
        y=rn_node_df.loc[mask, "mean_null_betweenness"],
        boxpoints='all',
        jitter=0.7,          # horizontal jitter
        pointpos=point_offsets_nodes[label],          # centered at category
        whiskerwidth=0,      # no whiskers
        line=dict(width=0),  # no box outline
        fillcolor='rgba(0,0,0,0)',  # transparent background
        marker=dict(
            color=node_palette[label],
            opacity=1.0,
            size=dotsize
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Node: %{customdata[0]}<br>"
            "Kind: %{customdata[1]}<br>"
            "Betweenness: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            rn_node_df.loc[mask, "node"],
            rn_node_df.loc[mask, "node_type"],
            rn_node_df.loc[mask, "mean_null_betweenness"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=1)


# Pairs

# --- "Other" pairs ---
mask_other = rn_pair_df["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=rn_pair_df.loc[mask_other, "period"],
    y=rn_pair_df.loc[mask_other, "mean_null_edge_betweenness_weighted"],
    boxpoints='all',
    jitter=0.3,            # no horizontal jitter for "Other"
    pointpos=0,          # centered at category
    whiskerwidth=0,      # no whiskers
    line=dict(width=0),  # no box outline
    fillcolor='rgba(0,0,0,0)',  # transparent background
    marker=dict(
        color=pair_palette["Other"],
        opacity=other_opacity,
        size=dotsize,
        symbol='pentagon'
    ),
    name="Other",
    legendgroup="Other",
    hovertemplate=(
        "Pair: %{customdata[0]}<br>"
        "Edge Betweenness: %{y}<br><extra></extra>"
    ),
    customdata=np.stack((
        rn_pair_df.loc[mask_other, "names"],
        rn_pair_df.loc[mask_other, "mean_null_edge_betweenness_weighted"]
    ), axis=-1),
    showlegend=True
), row=1, col=2)

# --- Highlighted pairs (jittered) ---
for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = rn_pair_df["color_group"] == label
    subplots.add_trace(go.Box(
        x=rn_pair_df.loc[mask, "period"],
        y=rn_pair_df.loc[mask, "mean_null_edge_betweenness_weighted"],
        boxpoints='all',
        jitter=0.7,
        pointpos=point_offsets_pairs[label],
        whiskerwidth=0,
        line=dict(width=0),
        fillcolor='rgba(0,0,0,0)',
        marker=dict(
            color=pair_palette[label],
            opacity=1.0,
            size=dotsize,
            symbol='pentagon'
        ),
        name=label,
        legendgroup=label,
        hovertemplate=(
            "Pair: %{customdata[0]}<br>"
            "Edge Betweenness: %{y}<br><extra></extra>"
        ),
        customdata=np.stack((
            rn_pair_df.loc[mask, "names"],
            rn_pair_df.loc[mask, "mean_null_edge_betweenness_weighted"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=2)
        
rng_boot = np.random.default_rng(42)

global_node_stats = figure_build.bootstrap_global_mean_ci(
    rn_summary.loc[rn_summary["metric"] == "betweenness", "tau_mean"].values,
    n_boot=5000,
    conf=0.95,
    rng=rng_boot,
)

global_pair_stats = figure_build.bootstrap_global_mean_ci(
    rn_summary.loc[rn_summary["metric"] == "edge_betweenness_weighted", "tau_mean"].values,
    n_boot=5000,
    conf=0.95,
    rng=rng_boot,
)


kendall_node_count = figure_build.add_kendall_overlay_no_compute(rn_summary, "node_count", subplots, 1, 1, period_order, 10, True, False, True, global_node_stats)
kendall_pair_count = figure_build.add_kendall_overlay_no_compute(rn_summary, "pair_count", subplots, 1, 2, period_order, 11, True, True, True, global_pair_stats)

subplots.update_xaxes(categoryorder="array", categoryarray=period_order,   tickvals=list(range(len(period_order))), ticktext=[period_labels[p] for p in period_order], row=1)

subplots.update_yaxes(row=1, col=1, title_text="Node Count (Random Model)", title_standoff=1)
subplots.update_yaxes(row=1, col=2, title_text="Pair Count (Random Model)", title_standoff=1)

subplots.update_layout(
    template=template_type,
    showlegend=showlegend,
    xaxis=dict(categoryorder="array", categoryarray=period_order),
    font=dict(family="Arial", size=7),
    # title_text="Degree distribution per period"
    margin={'t':2,'l':10,'b':2,'r':10}
)


In [11]:
subplots.write_image(f"{path}/results/feature-only-KG/img/fig3_2_betweenness_random_model.svg", 
                     width=cm_to_px(12), height=cm_to_px(6))

# Fig 3. Betweenness

In [9]:
# Load Datasets--------------------------------------------
# Just in case it is modified later :-)
node_btw = data_loads.all_top_betweenness_df.copy()
edge_btw = data_loads.all_edge_betweenness_df.copy()

# Create subplots ----------------------------------------------------------------
subplots = sp.make_subplots(rows=1, cols=2, 
                            column_widths=[0.5, 0.5], 
                            horizontal_spacing=0.15,)

# ------------------------------
# A) Plot degree distribution over time
# ------------------------------
# Add "Other" nodes first as box plots to subplot col 1
other_mask_nodes = node_btw["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=node_btw[other_mask_nodes]["period"],
    y=node_btw[other_mask_nodes]["score_norm"],
    name="Other",
    legendgroup="Other",
    marker=dict(color=node_palette["Other"], 
                opacity=other_opacity, size=dotsize),
    line=dict(width=0),
    whiskerwidth=0,
    fillcolor='rgba(0,0,0,0)',
    boxpoints="all",
    jitter=0.3,
    pointpos=0,
    hovertemplate=
        "Node: %{customdata[0]}<br>" +
        "Kind: %{customdata[1]}<br>" +
        "Betweenness: %{y}<br><extra></extra>",
    customdata=np.stack((
        node_btw[other_mask_nodes]["short_name"],
        node_btw[other_mask_nodes]["kind"],
        node_btw[other_mask_nodes]["score_norm"]
    ), axis=-1),
    showlegend=True
), row=1, col=1)

# Add highlighted nodes in specified order as box plots to subplot col 1
for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = node_btw["color_group"] == label
    subplots.add_trace(go.Box(
        x=node_btw[mask]["period"],
        y=node_btw[mask]["score_norm"],
        name=label,
        legendgroup=label,
        marker=dict(color=node_palette[label], opacity=1, size=dotsize),
        whiskerwidth=0,
        line=dict(width=0),
        fillcolor='rgba(0,0,0,0)',
        boxpoints="all",
        jitter=0.7,
        pointpos=point_offsets_nodes[label],
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Betweenness: %{y}<br><extra></extra>",
        customdata=np.stack((
            node_btw[mask]["short_name"],
            node_btw[mask]["kind"],
            node_btw[mask]["score_norm"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=1)

# ------------------------------
# B) Edge Betweenness over time (box plots)
# ------------------------------

other_mask_pairs = edge_btw["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=edge_btw[other_mask_pairs]["period"],
    y=edge_btw[other_mask_pairs]["edge_betweenness_weighted"],
    name="Other",
    legendgroup="Other",
    marker=dict(color=pair_palette["Other"],
                opacity=other_opacity, size=dotsize),
    line=dict(width=0),
    whiskerwidth=0,
    fillcolor='rgba(0,0,0,0)',
    boxpoints="all",
    jitter=0.3,
    pointpos=0,
    hovertemplate=
        "Pair: %{customdata[0]}<br>" +
        "Betweenness: %{y}<br>" +
        "<extra></extra>",
    customdata=np.stack((
        edge_btw[other_mask_pairs]["short_name"],
        edge_btw[other_mask_pairs]["edge_betweenness_weighted"]
    ), axis=-1),
    showlegend=True
), row=1, col=2)

# Add highlighted pairs in specified order as box plots to subplot col 2
for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = edge_btw["color_group"] == label
    subplots.add_trace(go.Box(
        x=edge_btw[mask]["period"],
        y=edge_btw[mask]["edge_betweenness_weighted"],
        name=label,
        legendgroup=label,
        marker=dict(color=pair_palette[label], opacity=1, size=dotsize),
        line=dict(width=0),
        whiskerwidth=0,
        fillcolor='rgba(0,0,0,0)',
        boxpoints="all",
        jitter=0.7,
        pointpos=point_offsets_pairs[label],
        hovertemplate=
            "Pair: %{customdata[0]}<br>" +
            "Betweenness: %{y}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            edge_btw[mask]["short_name"],
            edge_btw[mask]["edge_betweenness_weighted"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=2)

 
degree_kendall_results = figure_build.add_kendall_overlay(
    subplots, node_btw, value_col="score_norm", period_col="period",
    row=1, col=1, period_order=period_order, overlay_idx=10, show_overlay_x=True, show_overlay_y=False
)

pair_kendall_results = figure_build.add_kendall_overlay(
    subplots, edge_btw, value_col="edge_betweenness_weighted", period_col="period",
    row=1, col=2, period_order=period_order, overlay_idx=11, show_overlay_x=True, show_overlay_y=True
)


subplots.update_xaxes(categoryorder="array", categoryarray=period_order,   tickvals=list(range(len(period_order))),ticktext=[period_labels[p] for p in period_order], row=1)
subplots.update_yaxes(row=1, col=1, title_text="Node Betweenness", title_standoff=1)
subplots.update_yaxes(row=1, col=2, title_text="Edge Betweenness", title_standoff=1)


subplots.update_layout(
    template=template_type,
    showlegend=showlegend,
    xaxis=dict(categoryorder="array", categoryarray=period_order),
    font=dict(family="Arial", size=7),
    # title_text="Degree distribution per period"
    margin={'t':2,'l':2,'b':2,'r':2}
)

# Add panel labels "(a)" and "(b)" above each subplot
panel_labels = ['(a)', '(b)']
# paper x positions for two columns (small left margin and halfway)
x_positions = [-0.03, 0.544455]

existing = list(subplots.layout.annotations) if (hasattr(subplots.layout, "annotations") and subplots.layout.annotations is not None) else []
for lbl, x in zip(panel_labels, x_positions):
    existing.append(dict(
        text=lbl,
        x=x,
        xref='paper',
        xanchor='left',
        y=1.02,
        yref='paper',
        showarrow=False,
        font=dict(family='Arial', size=8, color='black')
    ))

subplots.update_layout(annotations=existing)


In [21]:
subplots.write_image(f"{path}/results/feature-only-KG/img/fig3_normed.svg",
                        width=cm_to_px(12), height=cm_to_px(6))

# Fig S1. Unweighted Betweenness

In [11]:
# Load Datasets--------------------------------------------
# Just in case it is modified later :-)
node_btw = data_loads.all_top_betweenness_noweight_df.copy()
edge_btw = data_loads.all_edge_betweenness_df.copy()


# Create subplots ----------------------------------------------------------------
subplots = sp.make_subplots(rows=1, cols=2, 
                            column_widths=[0.5, 0.5], 
                            horizontal_spacing=0.15,)

# ------------------------------
# A) Plot degree distribution over time
# ------------------------------
# Add "Other" nodes first as box plots to subplot col 1
other_mask_nodes = node_btw["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=node_btw[other_mask_nodes]["period"],
    y=node_btw[other_mask_nodes]["score_norm"],
    name="Other",
    legendgroup="Other",
    marker=dict(color=node_palette["Other"], 
                opacity=other_opacity, size=dotsize),
    line=dict(width=0),
    whiskerwidth=0,
    fillcolor='rgba(0,0,0,0)',
    boxpoints="all",
    jitter=0.3,
    pointpos=0,
    hovertemplate=
        "Node: %{customdata[0]}<br>" +
        "Kind: %{customdata[1]}<br>" +
        "Betweenness: %{y}<br><extra></extra>",
    customdata=np.stack((
        node_btw[other_mask_nodes]["short_name"],
        node_btw[other_mask_nodes]["kind"],
        node_btw[other_mask_nodes]["score_norm"]
    ), axis=-1),
    showlegend=True
), row=1, col=1)

# Add highlighted nodes in specified order as box plots to subplot col 1
for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = node_btw["color_group"] == label
    subplots.add_trace(go.Box(
        x=node_btw[mask]["period"],
        y=node_btw[mask]["score_norm"],
        name=label,
        legendgroup=label,
        marker=dict(color=node_palette[label], opacity=1, size=dotsize),
        whiskerwidth=0,
        line=dict(width=0),
        fillcolor='rgba(0,0,0,0)',
        boxpoints="all",
        jitter=0.7,
        pointpos=point_offsets_nodes[label],
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Betweenness: %{y}<br><extra></extra>",
        customdata=np.stack((
            node_btw[mask]["short_name"],
            node_btw[mask]["kind"],
            node_btw[mask]["score_norm"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=1)

# ------------------------------
# B) Edge Betweenness over time (box plots)
# ------------------------------

other_mask_pairs = edge_btw["color_group"] == "Other"
subplots.add_trace(go.Box(
    x=edge_btw[other_mask_pairs]["period"],
    y=edge_btw[other_mask_pairs]["edge_betweenness"],
    name="Other",
    legendgroup="Other",
    marker=dict(color=pair_palette["Other"],
                opacity=other_opacity, size=dotsize),
    line=dict(width=0),
    whiskerwidth=0,
    fillcolor='rgba(0,0,0,0)',
    boxpoints="all",
    jitter=0.3,
    pointpos=0,
    hovertemplate=
        "Pair: %{customdata[0]}<br>" +
        "Betweenness: %{y}<br>" +
        "<extra></extra>",
    customdata=np.stack((
        edge_btw[other_mask_pairs]["short_name"],
        edge_btw[other_mask_pairs]["edge_betweenness"]
    ), axis=-1),
    showlegend=True
), row=1, col=2)

# Add highlighted pairs in specified order as box plots to subplot col 2
for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = edge_btw["color_group"] == label
    subplots.add_trace(go.Box(
        x=edge_btw[mask]["period"],
        y=edge_btw[mask]["edge_betweenness"],
        name=label,
        legendgroup=label,
        marker=dict(color=pair_palette[label], opacity=1, size=dotsize),
        line=dict(width=0),
        whiskerwidth=0,
        fillcolor='rgba(0,0,0,0)',
        boxpoints="all",
        jitter=0.7,
        pointpos=point_offsets_pairs[label],
        hovertemplate=
            "Pair: %{customdata[0]}<br>" +
            "Betweenness: %{y}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            edge_btw[mask]["short_name"],
            edge_btw[mask]["edge_betweenness"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=2)

 



degree_kendall_results = figure_build.add_kendall_overlay(
    subplots, node_btw, value_col="score_norm", period_col="period",
    row=1, col=1, period_order=period_order, overlay_idx=10, show_overlay_x=True, show_overlay_y=False
)

pair_kendall_results = figure_build.add_kendall_overlay(
    subplots, edge_btw, value_col="edge_betweenness", period_col="period",
    row=1, col=2, period_order=period_order, overlay_idx=11, show_overlay_x=True, show_overlay_y=True
)


subplots.update_xaxes(categoryorder="array", categoryarray=period_order,   tickvals=list(range(len(period_order))),ticktext=[period_labels[p] for p in period_order], row=1)
subplots.update_yaxes(row=1, col=1, title_text="Node Betweenness (Unweighted)", title_standoff=1)
subplots.update_yaxes(row=1, col=2, title_text="Edge Betweenness (Unweighted)", title_standoff=1)


subplots.update_layout(
    template=template_type,
    showlegend=showlegend,
    xaxis=dict(categoryorder="array", categoryarray=period_order),
    font=dict(family="Arial", size=7),
    # title_text="Degree distribution per period"
    margin={'t':2,'l':2,'b':2,'r':2}
)

# Add panel labels "(a)" and "(b)" above each subplot
panel_labels = ['(a)', '(b)']
# paper x positions for two columns (small left margin and halfway)
x_positions = [-0.03, 0.544455]

existing = list(subplots.layout.annotations) if (hasattr(subplots.layout, "annotations") and subplots.layout.annotations is not None) else []
for lbl, x in zip(panel_labels, x_positions):
    existing.append(dict(
        text=lbl,
        x=x,
        xref='paper',
        xanchor='left',
        y=1.02,
        yref='paper',
        showarrow=False,
        font=dict(family='Arial', size=8, color='black')
    ))

subplots.update_layout(annotations=existing)


In [9]:
subplots.write_image(f"{path}/results/feature-only-KG/img/fig3_noweight.svg",
                        width=cm_to_px(12), height=cm_to_px(6))

# Fig S2. Strength vs. Degree \& D/2*#Triplets vs. Betweenness

In [14]:
# Load Datasets--------------------------------------------
all_norm_degree_df = data_loads.all_norm_degree_df.copy()
strength_df = data_loads.all_top_strength_df.copy()
deg_df = data_loads.all_top_degree_df.copy()


strength_df = strength_df.sort_values(by="period", ascending=True)
deg_df = deg_df.sort_values(by="period", ascending=True)

subplot_titles = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]
subplot_titles = [title + " " + period for title, period in zip(subplot_titles, period_order*2)]

fig = sp.make_subplots(rows=4, cols=3, subplot_titles=subplot_titles, vertical_spacing=0.09)

for idx, period in enumerate(period_order):
    if period not in strength_df['period'].values and period not in deg_df['period'].values:
        continue

    period_data = pd.merge(
        strength_df[strength_df['period'] == period],
        deg_df[deg_df['period'] == period],
        on=['node', 'period', 'kind', 'color_group'],
        suffixes=('_strength', '_degree')
    )

    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 1
    
    # y values: use strength+1 so log axis is valid (no zeros)
    y_vals = period_data['strength'] + 1

    # Add scatter points (plotting strength+1 on log y-axis)
    fig.add_trace(go.Scatter(
        x=period_data['score'], 
        y=y_vals,
        mode='markers',
        marker=dict(
            size=4,
            color=[node_palette.get(cg, "#C7C7C7") for cg in period_data['color_group']]
        ),
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Degree: %{x}<br>" +
            "Strength: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['strength']), axis=-1),
        showlegend=False
    ), row=row, col=col)
    
    # Fit an OLS line in log-space: log10(strength+1) ~ score
    mask = (~period_data['score'].isna()) & (~period_data['strength'].isna())
    x = period_data.loc[mask, 'score'].to_numpy()
    y_log = np.log10(period_data.loc[mask, 'strength'] + 1).to_numpy()
    if len(x) > 1:
        z = np.polyfit(x, y_log, 1)
        x_trend = np.linspace(x.min(), x.max(), 100)
        # convert back from log10 to linear (strength+1)
        y_trend = 10 ** (np.polyval(z, x_trend))
    else:
        x_trend = np.array([])
        y_trend = np.array([])
    
    # Add trendline (on the same log y-axis)
    fig.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=1),
            showlegend=False,
            hoverinfo='skip',
        ),
        row=row, col=col
    )

    # Set y-axis to log for this subplot (strength plotted as strength+1)
    fig.update_yaxes(type="log", row=row, col=col)


# Normalized Degree vs Triplets ------------------------------------------------------------------

norm_plt = all_norm_degree_df.sort_values(by=["period","degree"], ascending=[True,False])
norm_plt['color_group'] = norm_plt['node'].map(taxonomy.NODE_HIGHLIGHTS).fillna("Other")

for idx, period in enumerate(period_order):
    if period not in norm_plt['period'].values:
        continue

    period_data = norm_plt[norm_plt['period'] == period]

    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 3
    
    # Add scatter points
    fig.add_trace(go.Scatter(
        # x=period_data['triplets'], 
        x=period_data['degree'],
        y=period_data['normalized_degree'],
        mode='markers', 
        marker=dict(
            size=4,
            color=[node_palette.get(cg, "#C7C7C7") for cg in period_data['color_group']],
            symbol="circle"
        ),
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "#Triplets: %{x}<br>" +
            "Normalized_degree: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['normalized_degree']), axis=-1),
        showlegend=False
    ), row=row, col=col)  # Added row and col here
    
    # Fit a powerlaw: normalized_degree = a * triplets^b
    # Only use points with triplets > 0 and normalized_degree > 0
    # mask = (period_data['triplets'] > 0) & (period_data['normalized_degree'] > 0)
    mask = (period_data['degree'] > 0) & (period_data['normalized_degree'] > 0)
    # x = period_data.loc[mask, 'triplets'].to_numpy()
    x = period_data.loc[mask, 'degree'].to_numpy()
    y = period_data.loc[mask, 'normalized_degree'].to_numpy()
    if len(x) > 1 and len(y) > 1:
        # log-log fit: log(y) = log(a) + b*log(x)
        logx = np.log(x)
        logy = np.log(y)
        # b, loga = np.polyfit(logx, y, 1)
        b, loga = np.polyfit(logx, logy, 1)
        a = np.exp(loga)
        x_trend = np.linspace(x.min(), x.max(), 100)
        y_trend = a * x_trend ** b
    else:
        x_trend = np.array([])
        y_trend = np.array([])
        
    # Add trendline
    fig.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=1),
            showlegend=False,
            hoverinfo='skip',
            
        ),
        row=row, col=col
    )

fig.update_layout(
    template=template_type, 
    showlegend=showlegend,
    font=dict(family="Arial", size=7),
    margin={'t':5,'l':5,'b':5,'r':5},
)

# 1) Fix subplot title sizing (safe form)
fig.update_annotations(font=dict(size=7))

# 2) Apply axis updates by looping rows/cols (no lists)
for r in (1, 2):
    fig.update_yaxes(range=[0, 3], row=r)
    for c in (1, 2, 3):
        fig.update_xaxes(range=[0, 20], row=r, col=c)
        if c == 1:
            fig.update_yaxes(title_text="Strength+1", ticklabelstandoff=1, title_standoff=1, row=r, col=c)
for c in (1, 2, 3):
    fig.update_xaxes(title_text="Degree", row=2, col=c)

# 3) Log axes: double-check ranges (these are log10 units!)
# If you really want triplets in [1, 100] and normalized_degree in [0.1, 1]:
for r in (3, 4):
    if r== 4:
        fig.update_xaxes(title_text="Degree", row=r)
    for c in (1, 2, 3):
        fig.update_xaxes(type="log", autorange=False, range=[0, 2], row=r, col=c)
        # fig.update_xaxes(type="log", autorange=False, range=[0, 2], row=r, col=c)
        fig.update_yaxes(type="log", title_text=("degree/2#Triplets" if c == 1 else None),
                         range=[-1, 0], ticklabelstandoff=1,title_standoff=1, row=r, col=c)

# 4) Titles/labels spacing (optional)
fig.update_xaxes(title_standoff=0)



fig.show()


In [13]:
fig.write_image(f"{path}/results/feature-only-KG/img/fig4.svg",
                     width=cm_to_px(18), height=cm_to_px(11.5))

# Fig 4. Capacitance
* Unweighted node capacitance
* Weighted node capacitance
* Unweighted edge capacitance

In [16]:
deg_df = data_loads.all_top_degree_df
strength_df = data_loads.all_top_strength_df
pair_df = data_loads.pairs
node_btw_noweight = data_loads.all_top_betweenness_noweight_df
node_btw = data_loads.all_top_betweenness_df

In [18]:
# Define Edge Degree
pair_df['edge_degree'] = pair_df['from_degree'] + pair_df['to_degree'] - 2

In [19]:
dot_size = 4


# Active nodes per period (degree > 0)
C_nodes = (deg_df.loc[deg_df['score'] > 0, ['period', 'short_name']]
                 .drop_duplicates()
                 .groupby('period')
                 .size()
                 .rename('C_nodes')
                 .reset_index())

M_nodes = (strength_df.loc[strength_df['strength'] > 0, ['period', 'short_name', 'strength']]
               .drop_duplicates()
               .groupby('period')['strength']
               .agg(M_nodes='max', C_nodes='size')
               .reset_index())

# --- Merge bases (you already have these) ---
data = pd.merge(
    deg_df.sort_values('period'),
    node_btw_noweight.sort_values('period'),
    on=['short_name','period','kind'],
    suffixes=('_degree','_betweenness')
)

data_2 = pair_df.copy()

data_3 = pd.merge(
    strength_df.sort_values('period'),
    node_btw.sort_values('period'),
    on=['short_name','period','kind','color_group'],
    suffixes=('_strength','_betweenness')
)

# --- Attach sizes per period ---
dataC   = data.merge(C_nodes, on='period', how='left')
data2E  = data_2.merge(C_nodes, on='period', how='left')
data3C  = data_3.merge(M_nodes, on='period', how='left')  # reuse node counts

# --- Per-(entity, period) aggregates + normalized ratios ---

# Nodes: A = mean betweenness metric (score_norm), B = mean degree; normalize by C_nodes
per_nodes = (dataC.groupby(['short_name','period'], as_index=False)
                  .agg(A=('score_norm','mean'),
                       B=('score_degree','mean'),
                       C_nodes=('C_nodes','first')))

per_nodes['bet_deg_ratio_raw'] = np.where(per_nodes['B']>0,
                                          per_nodes['A']/per_nodes['B'],
                                          np.nan)
per_nodes['bet_deg_ratio'] = np.where(per_nodes['B']>0,
                                      per_nodes['A'] * (per_nodes['C_nodes']-1) / per_nodes['B'],
                                      np.nan)

# Pairs: A = mean weighted edge-betweenness, B = mean pair count; normalize by E_pairs

per_pairs = (data2E.groupby(['short_name','period'], as_index=False)
                 .agg(A=('edge_betweenness','mean'),
                      B=('edge_degree','mean'),
                      C_nodes=('C_nodes','first')))

per_pairs['bet_edge_ratio'] = np.where(per_pairs['B']>0,
                                       per_pairs['A'] * (2*per_pairs['C_nodes']-2) / per_pairs['B'],
                                       np.nan)

per_pairs['bet_edge_ratio_raw'] = np.where(per_pairs['B']>0,
                                           per_pairs['A']/per_pairs['B'],
                                           np.nan)

# Strength: A = mean betweenness metric (score_norm), B = mean strength; normalize by C_nodes
per_strength = (data3C.groupby(['short_name','period'], as_index=False)
                    .agg(A=('score_norm','mean'),
                         B=('strength','mean'),
                         C_nodes=('C_nodes','first'),
                         M_nodes=('M_nodes','first')))

per_strength['bet_strength_ratio_raw'] = np.where(per_strength['B']>0,
                                                  per_strength['A']/per_strength['B'],
                                                  np.nan)
per_strength['bet_strength_ratio'] = np.where(per_strength['B']>0,
                                              per_strength['A'] * per_strength['M_nodes'] / per_strength['B'],
                                              np.nan)

# --- Optional: enforce your period order ---
if 'period_order' in globals():
    for df in (per_nodes, per_pairs, per_strength):
        df['period'] = pd.Categorical(df['period'], categories=period_order, ordered=True)
        df.sort_values(['period'], inplace=True)

# --- Collapse across periods (mean of ratios per entity) ---
group   = (per_nodes.groupby('short_name', as_index=False)
                   .agg(bet_deg_ratio=('bet_deg_ratio','mean'),
                        bet_deg_ratio_raw=('bet_deg_ratio_raw','mean'),
                        avg_degree=('B','mean'),
                        med_degree=('B','median'),
                        bet_deg_ratio_median=('bet_deg_ratio','median'),
                        bet_deg_ratio_std=('bet_deg_ratio','std'),
                        bet_deg_ratio_min=('bet_deg_ratio','min'),
                        bet_deg_ratio_max=('bet_deg_ratio','max')))

group['rank'] = group['bet_deg_ratio_median'].rank(ascending=True, pct=True)

group_2 = (per_pairs.groupby('short_name', as_index=False)
                   .agg(bet_edge_ratio=('bet_edge_ratio','mean'),
                        bet_edge_ratio_raw=('bet_edge_ratio','mean'),
                        avg_pair_count=('B','mean'),
                        med_pair_count=('B','median'),
                        bet_edge_ratio_median=('bet_edge_ratio','median'),
                        bet_edge_ratio_std=('bet_edge_ratio','std'),
                        bet_edge_ratio_min=('bet_edge_ratio','min'),
                        bet_edge_ratio_max=('bet_edge_ratio','max')))

group_2['rank'] = group_2['bet_edge_ratio_median'].rank(ascending=True, pct=True)

group_3 = (per_strength.groupby('short_name', as_index=False)
                     .agg(bet_strength_ratio=('bet_strength_ratio','mean'),
                          bet_strength_ratio_raw=('bet_strength_ratio_raw','mean'),
                          avg_strength=('B','mean'),
                          med_strength=('B','median'),
                          bet_strength_ratio_median=('bet_strength_ratio','median'),
                          bet_strength_ratio_std=('bet_strength_ratio','std'),
                          bet_strength_ratio_min=('bet_strength_ratio','min'),
                          bet_strength_ratio_max=('bet_strength_ratio','max')))

group_3['rank'] = group_3['bet_strength_ratio_median'].rank(ascending=True, pct=True)


subfig = sp.make_subplots(rows=1, cols=3, 
                            column_widths=[0.33, 0.33, 0.33], 
                            horizontal_spacing=0.05)


others_mask_nodes = ~group['short_name'].isin(taxonomy.NODE_HIGHLIGHTS.values())
top_group = group.loc[others_mask_nodes][group['rank'] >= 0.95].copy()
rest = group.loc[others_mask_nodes][group['rank'] < 0.95].copy()

subfig.add_trace(go.Scatter(
    x= top_group.loc[others_mask_nodes,'short_name'],
    y= top_group.loc[others_mask_nodes,'bet_deg_ratio_median'],
    mode = 'markers+text',
    marker=dict(size=dot_size, symbol="circle", color="#7BB0DF"),
    text=top_group.loc[others_mask_nodes,'short_name'],
    textposition="middle right",
    hovertemplate= 
        "Node: %{customdata[0]}<br>" +
        "Betweenness/Degree Ratio: %{y}<br><extra></extra>" +
        "Degree: %{x}",
    customdata=np.stack((
        top_group[others_mask_nodes]['short_name'],
        top_group[others_mask_nodes]['bet_deg_ratio_median'] 
    ), axis=-1
)), row=1, col=1)

subfig.add_trace(go.Scatter(
    x= rest.loc[others_mask_nodes,'short_name'],
    y= rest.loc[others_mask_nodes,'bet_deg_ratio_median'],
    mode = 'markers',
    marker=dict(size=dot_size, symbol="circle", color=node_palette["Other"], opacity=other_opacity),
    error_y = dict(type = 'data', 
                symmetric = False,
                array = rest.loc[others_mask_nodes,'bet_deg_ratio_max'] - rest.loc[others_mask_nodes,'bet_deg_ratio_median'],       # upward to max
                arrayminus = rest.loc[others_mask_nodes,'bet_deg_ratio_median'] - rest.loc[others_mask_nodes,'bet_deg_ratio_min'],  # downward to min
                visible = True,
                width = 1,
                thickness = 1,
                color = f"rgba(64, 64, 64, {other_opacity})"
                ),
    hovertemplate= 
        "Node: %{customdata[0]}<br>" +
        "Betweenness/Degree Ratio: %{y}<br><extra></extra>" +
        "Degree: %{x}<br>"+
        "Min-Max: %{customdata[2]} – %{customdata[3]}",
    customdata=np.stack((
        rest[others_mask_nodes]['short_name'],
        rest[others_mask_nodes]['bet_deg_ratio_median'],
        rest[others_mask_nodes]['bet_deg_ratio_min'],
        rest[others_mask_nodes]['bet_deg_ratio_max'],
    ), axis=-1
)), row=1, col=1)

for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = group['short_name'] == label
    subfig.add_trace(go.Scatter(
        x=group.loc[mask, 'short_name'],
        y=group.loc[mask, 'bet_deg_ratio_median'],
        mode='markers',
        marker=dict(size=dot_size, symbol="circle", color=node_palette[label]),
        error_y=dict(
            type='data',
            symmetric=False,
            array=group.loc[mask, 'bet_deg_ratio_max'] - group.loc[mask, 'bet_deg_ratio_median'],       # upward to max
            arrayminus=group.loc[mask, 'bet_deg_ratio_median'] - group.loc[mask, 'bet_deg_ratio_min'],  # downward to min
            visible=True,
            width=1,
            thickness=1
        ),
        hovertemplate=(
            "Node: %{customdata[0]}<br>"
            "Mean ratio (bet/deg): %{y:.3f}<br>"
            "Median ratio (bet/deg): %{customdata[1]:.3f}<br>"
            "Min–Max: %{customdata[2]:.3f} – %{customdata[3]:.3f}<br>"
            "Degree (mean x): %{x:.3f}<extra></extra>"
        ),
        customdata=np.stack((
            group.loc[mask, 'short_name'],
            group.loc[mask, 'bet_deg_ratio_median'],
            group.loc[mask, 'bet_deg_ratio_min'],
            group.loc[mask, 'bet_deg_ratio_max']
        ), axis=-1),
        showlegend=False
    ), row=1, col=1)


others_mask_pairs = ~group_2['short_name'].isin(taxonomy.PAIR_HIGHLIGHTS.values())
top_group_2 = group_2.loc[others_mask_pairs][group_2['rank'] > 0.95].copy()
rest_2 = group_2.loc[others_mask_pairs][group_2['rank'] <= 0.95].copy()

for name in top_group_2.loc[others_mask_pairs,'short_name'].unique():
    mask = top_group_2['short_name'] == name
    subfig.add_trace(go.Scatter(
        x= top_group_2.loc[mask,'short_name'],
        y=top_group_2.loc[mask,'bet_edge_ratio_median'],
        mode = 'markers+text',
        marker=dict(size=dot_size, symbol="pentagon", color="#7BB0DF"),
        text=top_group_2.loc[mask,'short_name'] + f"[{top_group_2.loc[mask, 'bet_edge_ratio_min'].values[0]:.2f}–{top_group_2.loc[mask, 'bet_edge_ratio_max'].values[0]:.2f}]",
        textposition="middle right",
        error_y= dict(type = 'data',
                    symmetric = False,
                    array = top_group_2.loc[mask,'bet_edge_ratio_max'] - top_group_2.loc[mask,'bet_edge_ratio_median'],       # upward to max
                    arrayminus = top_group_2.loc[mask,'bet_edge_ratio_median'] - top_group_2.loc[mask,'bet_edge_ratio_min'],  # downward to min
                    visible = True,
                    width = 1,
                    thickness = 1
                    ),
        hovertemplate= 
            "Pair: %{customdata[0]}<br>" +
            "Betweenness/Count Ratio: %{y}<br><extra></extra>" +
            "Count: %{x}",
        customdata=np.stack((
            top_group_2[mask]['short_name'],
            top_group_2[mask]['bet_edge_ratio_median'] 
        ), axis=-1
    )), row=1, col=3)


subfig.add_trace(go.Scatter(
    x= rest_2.loc[others_mask_pairs,'short_name'],
    y=rest_2.loc[others_mask_pairs,'bet_edge_ratio_median'],
    mode = 'markers',
    marker=dict(size=dot_size, symbol="pentagon", color=pair_palette["Other"], opacity=other_opacity),
    error_y = dict(type = 'data', 
                symmetric = False,
                array = rest_2.loc[others_mask_pairs,'bet_edge_ratio_max'] - rest_2.loc[others_mask_pairs,'bet_edge_ratio_median'],       # upward to max
                arrayminus = rest_2.loc[others_mask_pairs,'bet_edge_ratio_median'] - rest_2.loc[others_mask_pairs,'bet_edge_ratio_min'],  # downward to min
                visible = True,
                width = 1,
                thickness = 1,
                color = f"rgba(64, 64, 64, {other_opacity})"
                ),
    hovertemplate= 
        "Pair: %{customdata[0]}<br>" +
        "Betweenness/Count Ratio: %{y}<br><extra></extra>" +
        "Count: %{x}",
    customdata=np.stack((
        rest_2[others_mask_pairs]['short_name'],
        rest_2[others_mask_pairs]['bet_edge_ratio_median'] 
    ), axis=-1
)), row=1, col=3)

for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = group_2['short_name'] == label
    
    subfig.add_trace(go.Scatter(
        x=group_2.loc[mask, 'short_name'],
        y=group_2.loc[mask, 'bet_edge_ratio_median'],
        marker=dict(size=dot_size, symbol="pentagon", color=pair_palette[label]),
        error_y=dict(
            type='data',
            symmetric=False,
            array=group_2.loc[mask, 'bet_edge_ratio_max'] - group_2.loc[mask, 'bet_edge_ratio_median'],
            arrayminus= group_2.loc[mask, 'bet_edge_ratio_median'] - group_2.loc[mask, 'bet_edge_ratio_min'],
            visible=True,
            width =1,
            thickness=1
        ),
        hovertemplate=(
            "Node: %{customdata[0]}<br>"
            "Mean ratio (bet/count): %{y:.3f}<br>"
            "Median ratio (bet/count): %{customdata[1]:.3f}<br>"
            "Min–Max: %{customdata[2]:.3f} – %{customdata[3]:.3f}<br>"
            "Degree (mean x): %{x:.3f}<extra></extra>"
        ),
        customdata=np.stack((
            group_2.loc[mask, 'short_name'],
            group_2.loc[mask, 'bet_edge_ratio_median'],
            group_2.loc[mask, 'bet_edge_ratio_min'],
            group_2.loc[mask, 'bet_edge_ratio_max']
        ), axis=-1),
        showlegend=False
    ), row=1, col=3)




others_mask_nodes = ~group_3['short_name'].isin(taxonomy.NODE_HIGHLIGHTS.values())
top_group_3 = group_3.loc[others_mask_nodes][group_3['rank'] >= 0.95].copy()
rest_3 = group_3.loc[others_mask_nodes][group_3['rank'] < 0.95].copy()

subfig.add_trace(go.Scatter(
    x= top_group_3.loc[others_mask_nodes,'short_name'],
    y= top_group_3.loc[others_mask_nodes,'bet_strength_ratio_median'],
    mode = 'markers+text',
    marker=dict(size=dot_size, symbol="circle", color="#7BB0DF"),
    text=top_group_3.loc[others_mask_nodes,'short_name'],
    textposition="middle right",
    hovertemplate= 
        "Node: %{customdata[0]}<br>" +
        "Betweenness/ Strength Ratio: %{y}<br><extra></extra>" +
        "Strength: %{x}",
    customdata=np.stack((
        top_group_3[others_mask_nodes]['short_name'],
        top_group_3[others_mask_nodes]['bet_strength_ratio_median'] 
    ), axis=-1
)), row=1, col=2)

subfig.add_trace(go.Scatter(
    x= rest_3.loc[others_mask_nodes,'short_name'],
    y= rest_3.loc[others_mask_nodes,'bet_strength_ratio_median'],
    mode = 'markers',
    marker=dict(size=dot_size, symbol="circle", color=node_palette["Other"], opacity=other_opacity),
    error_y= dict(type = 'data',
                symmetric = False,
                array = rest_3.loc[others_mask_nodes,'bet_strength_ratio_max'] - rest_3.loc[others_mask_nodes,'bet_strength_ratio_median'],       # upward to max
                arrayminus = rest_3.loc[others_mask_nodes,'bet_strength_ratio_median'] - rest_3.loc[others_mask_nodes,'bet_strength_ratio_min'],  # downward to min
                visible = True,
                width = 1,
                thickness = 1,
                color = f"rgba(64, 64, 64, {other_opacity})"
                ),
    hovertemplate= 
        "Node: %{customdata[0]}<br>" +
        "Betweenness/ Strength Ratio: %{y}<br><extra></extra>" +
        "Strength: %{x}<br>"+
        "Min-Max: %{customdata[2]} – %{customdata[3]}",
    customdata=np.stack((
        rest_3[others_mask_nodes]['short_name'],
        rest_3[others_mask_nodes]['bet_strength_ratio_median'],
        rest_3[others_mask_nodes]['bet_strength_ratio_min'],
        rest_3[others_mask_nodes]['bet_strength_ratio_max']
), axis=-1
)), row=1, col=2)


for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = group_3['short_name'] == label
    subfig.add_trace(go.Scatter(
        x=group_3.loc[mask, 'short_name'],
        y=group_3.loc[mask, 'bet_strength_ratio_median'], # Changed it to average, bet ratio
        mode='markers',
        marker=dict(size=dot_size, symbol="circle", color=node_palette[label]),
        error_y=dict(
            type='data',
            symmetric=False,
            array=group_3.loc[mask, 'bet_strength_ratio_max'] - group_3.loc[mask, 'bet_strength_ratio_median'],
            arrayminus= group_3.loc[mask, 'bet_strength_ratio_median']-group_3.loc[mask, 'bet_strength_ratio_min'],
            visible=True,
            width=1,
            thickness=1
        ),
        hovertemplate=(
            "Node: %{customdata[0]}<br>"
            "Mean ratio (bet/strength): %{y:.3f}<br>"
            "Median ratio (bet/strength): %{customdata[1]:.3f}<br>"
            "Min–Max: %{customdata[2]:.3f} – %{customdata[3]:.3f}<br>"
            "Degree (mean x): %{x:.3f}<extra></extra>"
        ),
        customdata=np.stack((
            group_3.loc[mask, 'short_name'],
            group_3.loc[mask, 'bet_strength_ratio_median'],
            group_3.loc[mask, 'bet_strength_ratio_min'],
            group_3.loc[mask, 'bet_strength_ratio_max']
        ), axis=-1),
        showlegend=False
    ), row=1, col=2)



subfig.update_layout(
    template=template_type,
    showlegend=showlegend,
    font=dict(family="Arial", size=7),
    margin={'t':2,'l':2,'b':2,'r':2}
)



import re

def natural_key(name):
    """
    Extracts the numeric part of e.g. 'D13' → 13
    """
    match = re.search(r'(\d+)$', name)
    return (re.split(r'\d', name)[0], int(match.group(1)) if match else float('inf'))

def natural_key_pair(name):
    """
    Extracts the numeric parts of e.g. 'M1-R4' → (1, 4)
    """
    parts = name.split('-')
    keys = []
    for part in parts:
        match = re.search(r'(\d+)$', part)
        keys.append((re.split(r'\d', part)[0], int(match.group(1)) if match else float('inf')))
    return tuple(keys)

cats_node = sorted(group['short_name'].unique(), key=natural_key)
cats_pair = sorted(group_2['short_name'].unique(), key=natural_key_pair)


def set_cat_ticks(fig, cats, row, col, label_every=2, major_len=8):
    tickvals = cats
    ticktext = [c if i % label_every == 0 else "" for i, c in enumerate(cats)]

    fig.update_xaxes(
        categoryorder="array",
        categoryarray=cats,
        tickmode="array",
        tickvals=tickvals,
        ticktext=ticktext,   # "" => tick stays, label disappears
        ticks="outside",
        ticklen=major_len,
        showticklabels=True,
        row=row, col=col
    )

set_cat_ticks(subfig, cats_node, row=1, col=1, label_every=2)
set_cat_ticks(subfig, cats_node, row=1, col=2, label_every=2)
set_cat_ticks(subfig, cats_pair, row=1, col=3, label_every=10)


subfig.update_yaxes(title_text="Betweenness / Degree Ratio", row=1, col=1, title_standoff=1)
subfig.update_yaxes(title_text="Betweenness / Count Ratio", row=1, col=3, title_standoff=1)
subfig.update_yaxes(title_text="Betweenness / Strength Ratio", row=1, col=2, title_standoff=1)




Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will 

In [12]:
subfig.write_image(f"{path}/results/feature-only-KG/img/fig6-2_unweighted_edge_cap.svg",
                        width=cm_to_px(18), height=cm_to_px(6))


In [20]:
# aggregate medians and keep a representative 'kind' (first) for coloring
med_deg = data_loads.all_top_degree_df.groupby('short_name').agg(
    median=('score', 'median'),
    min=('score', 'min'),
    max=('score', 'max'),
    kind=('kind', 'first')
).reset_index()

fig = px.scatter(
    med_deg,
    x='short_name',
    y='median',
    error_y=med_deg['max'] - med_deg['median'],
    error_y_minus=med_deg['median'] - med_deg['min'],
    color='kind',  # now exists in med_deg
    title='Top Degree Nodes',
    labels={'median': 'Degree', 'short_name': 'Node'},
    hover_data={
        'short_name': True,
        'median': ':.3f',
    }
)

# Order x-axis categories by the median degree (descending)
cats_node = med_deg.sort_values('median', ascending=False)['short_name'].tolist()
fig.update_xaxes(categoryorder='array', categoryarray=cats_node)

fig.show()
    

In [12]:
main = data_loads.main
main[main['m_short_name'] == 'M7'][main['dt_short_name'] == 'D14']
# main[main['m_short_name'] == 'M7'][main['rq_short_name'] == 'R1']


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,id,title,year,doi,landing_page,abstract_inverted_index,language,is_oa,oa_status,oa_link,...,author_raw_names,author_raw_affiliations,topic_display_names,topic_scores,field_display_names,subfield_display_names,period,m_short_name,dt_short_name,rq_short_name
127,https://openalex.org/W1994441718,Estimation of Opportunity Inequality in Brazil...,2010,https://doi.org/10.1080/00220388.2010.500661,https://doi.org/10.1080/00220388.2010.500661,"{'Abstract': [0, 1], 'This': [2], 'article': [...",en,False,closed,,...,"['Erik Alencar de Figueiredo', 'Flávio Augusto...",[['Department of Economics Universidade Federa...,"['Income, Poverty, and Inequality', 'Intergene...","[0.9958, 0.9884]","['Social Sciences', 'Social Sciences']","['Sociology and Political Science', 'Sociology...",2006-2010,M7,D14,R1
292,https://openalex.org/W2773358030,Distributional Effects of a Continuous Treatme...,2020,https://doi.org/10.1111/obes.12355,https://doi.org/10.1111/obes.12355,"{'Abstract': [0], 'This': [1], 'paper': [2], '...",en,False,closed,,...,"['Brantly Callaway', 'Weige Huang']","[['Department of Economics, University of Miss...",['Intergenerational and Educational Inequality...,"[0.999, 0.9973, 0.9958]","['Social Sciences', 'Social Sciences', 'Mathem...","['Sociology and Political Science', 'Sociology...",2016-2020,M7,D14,R4


# Fig S3. Resurgence

## Nodes

In [27]:
res_node = pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_node_approach_allyears.csv")
res_node_w = pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_node_approach_papercounts_allyears.csv")
len(res_node), len(res_node_w)

(1500, 1500)

## Edges

In [28]:
res_pair = pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_edge_approach_allyears.csv")
res_pair_w = pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_edge_approach_papercounts_allyears.csv")
len(res_pair), len(res_pair_w)

(16800, 16800)

## Triplets

In [29]:
res_triple = pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_triplet_approach_allyears.csv")
res_triple_w = pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_triplet_approach_papercounts_allyears.csv")
len(res_triple), len(res_triple_w)

(7900, 7900)

In [30]:
dotsize = 3
others_dotsize = 1
others_line_width = 0.5
highlight_line_width = 1


res_node["key"] = res_node["key"].apply(graph.norm)
res_node["color_group"] = res_node["key"].map(taxonomy.NODE_HIGHLIGHTS).fillna("Other")

res_pair['key'] = res_pair.apply(lambda r: f"{r['key1']} - {r['key2']}", axis=1)
res_pair["key"] = res_pair['key'].apply(graph.norm)
res_pair["color_group"] = res_pair["key"].map(taxonomy.PAIR_HIGHLIGHTS).fillna("Other")

res_triple['key'] = res_triple.apply(lambda r: f"[{r['datatype']}, {r['measure']}, {r['rqtype']}]",axis=1)
res_triple["key"] = res_triple['key'].apply(graph.norm)
res_triple["color_group"] = res_triple["key"].map(taxonomy.TRIANGLE_HIGHLIGHTS).fillna("Other")

res_node_w['key'] = res_node_w['key'].apply(graph.norm)
res_node_w["color_group"] = res_node_w["key"].map(taxonomy.NODE_HIGHLIGHTS).fillna("Other") 

res_pair_w['key'] = res_pair_w.apply(lambda r: f"{r['key1']} - {r['key2']}", axis=1)
res_pair_w["key"] = res_pair_w['key'].apply(graph.norm)
res_pair_w["color_group"] = res_pair_w["key"].map(taxonomy.PAIR_HIGHLIGHTS).fillna("Other")

res_triple_w['key'] = res_triple_w.apply(lambda r: f"[{r['datatype']}, {r['measure']}, {r['rqtype']}]",axis=1)
res_triple_w["key"] = res_triple_w['key'].apply(graph.norm)
res_triple_w["color_group"] = res_triple_w["key"].map(taxonomy.TRIANGLE_HIGHLIGHTS).fillna("Other")  

triangle_palette = {
    "D1-M1-R2": "#008A69",
    "D1-M1-R4": "#1964B0",
    "D6-R4-M1": "#E9DC6D",
    "D3-M1-R2": "#DB5829",
    "D3-M1-R4": "#882D71",
    "Other": "#DEDEDE"
}


res_node = res_node.sort_values(by="year", ascending=True)
res_pair = res_pair.sort_values(by="year", ascending=True)
res_triple = res_triple.sort_values(by="year", ascending=True)
res_node_w = res_node_w.sort_values(by="year", ascending=True)
res_pair_w = res_pair_w.sort_values(by="year", ascending=True)
res_triple_w = res_triple_w.sort_values(by="year", ascending=True)

subfigs = sp.make_subplots(rows=2, cols=3)

# ---------------------------------------------------
# A) Nodes - Unweighted decay weight over time
# ---------------------------------------------------
for label in ["Other"]:
    others = (res_node[res_node["color_group"] == "Other"].sort_values(["key", "year"]))
    for i, (nm, df_) in enumerate(others.groupby("key", sort=False)):
        subfigs.add_trace(go.Scatter(
            x=df_["year"],
            y=df_["ew"],
            mode='markers+lines',
            marker=dict(color=node_palette[label], opacity=0.5, size=others_dotsize, symbol='circle'),
            line = dict(width=others_line_width),
            name=label,
            legendgroup=label,
            hovertemplate=
                "Node: %{customdata[0]}<br>" +
                "Decayed Weight: %{y}<br>" +
                "Year: %{x}<br>" +
                "<extra></extra>",
            customdata=np.stack((
                df_["key"],
                df_["ew"]
            ), axis=-1),
            showlegend=False
    ), row=1, col=1)
        
for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = res_node["color_group"] == label
    subfigs.add_trace(go.Scatter(
        x=res_node[mask]["year"],
        y=res_node[mask]["ew"],
        mode='markers+lines',
        marker=dict(color=node_palette[label], opacity=1, size=dotsize, symbol='circle'),
        line = dict(width = highlight_line_width),
        name=label,
        legendgroup=label,
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Decayed Weight: %{y}<br>" +
            "Year: %{x}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            res_node[mask]["key"],
            res_node[mask]["ew"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=1)

# ---------------------------------------------------
# B) Pairs - Unweighted decay weight over time
# ---------------------------------------------------

for label in ["Other"]:
    others = (res_pair[res_pair["color_group"] == "Other"].sort_values(["key", "year"]))
    for i, (nm, df_) in enumerate(others.groupby("key", sort=False)):
        subfigs.add_trace(go.Scatter(
            x=df_["year"],
            y=df_["ew"],
            mode='markers+lines',
            marker=dict(color=pair_palette[label], opacity=0.5, size=others_dotsize, symbol='pentagon'),
            line = dict(width=others_line_width),
            name=label,
            legendgroup=label,
            hovertemplate=
                "Pair: %{customdata[0]}<br>" +
                "Decayed Weight: %{y}<br>" +
                "Year: %{x}<br>" +
                "<extra></extra>",
            customdata=np.stack((
                df_["key"],
                df_["ew"]
            ), axis=-1),
            showlegend=False
    ), row=1, col=2)

for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = res_pair["color_group"] == label
    subfigs.add_trace(go.Scatter(
        x=res_pair[mask]["year"],
        y=res_pair[mask]["ew"],
        mode='markers+lines',
        marker=dict(color=pair_palette[label], opacity=1, size=dotsize, symbol='pentagon'),
        line = dict(width = highlight_line_width),
        name=label,
        legendgroup=label,
        hovertemplate=
            "Pair: %{customdata[0]}<br>" +
            "Decayed Weight: %{y}<br>" +
            "Year: %{x}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            res_pair[mask]["key"],
            res_pair[mask]["ew"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=2)    


# ---------------------------------------------------
# C) Triangles - Unweighted decay weight over time
# ---------------------------------------------------

for label in ["Other"]:
    others = (res_triple[res_triple["color_group"] == "Other"].sort_values(["key", "year"]))
    for i, (nm, df_) in enumerate(others.groupby("key", sort=False)):
        subfigs.add_trace(go.Scatter(
            x=df_["year"],
            y=df_["ew"],
            mode='markers+lines',
            marker=dict(color=triangle_palette[label], opacity=0.5, size=others_dotsize, symbol='triangle-up'),
            line = dict(width=others_line_width),
            name=label,
            legendgroup=label,
            hovertemplate=
                "Triangle: %{customdata[0]}<br>" +
                "Decayed Weight: %{y}<br>" +
                "Year: %{x}<br>" +
                "<extra></extra>",
            customdata=np.stack((
                df_["key"],
                df_["ew"]
            ), axis=-1),
            showlegend=False
    ), row=1, col=3)
        

for label in taxonomy.TRIANGLE_HIGHLIGHTS.values():
    mask = res_triple["color_group"] == label
    subfigs.add_trace(go.Scatter(
        x=res_triple[mask]["year"],
        y=res_triple[mask]["ew"],
        mode='markers+lines',
        marker=dict(color=triangle_palette[label], opacity=1, size=dotsize, symbol='triangle-up'),
        line = dict(width = highlight_line_width),
        name=label,
        legendgroup=label,
        hovertemplate=
            "Triangle: %{customdata[0]}<br>" +
            "Decayed Weight: %{y}<br>" +
            "Year: %{x}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            res_triple[mask]["key"],
            res_triple[mask]["ew"]
        ), axis=-1),
        showlegend=True
    ), row=1, col=3)

# ---------------------------------------------------
# D) Nodes - Weighted decay weight over time
# ---------------------------------------------------
for label in ["Other"]:
    others = (res_node_w[res_node_w["color_group"] == "Other"].sort_values(["key", "year"]))
    for i, (nm, df_) in enumerate(others.groupby("key", sort=False)):
        subfigs.add_trace(go.Scatter(
            x=df_["year"],
            y=df_["ew"],
            mode='markers+lines',
            marker=dict(color=node_palette[label], opacity=0.5, size=others_dotsize, symbol='circle'),
            line = dict(width=others_line_width),
            name=label,
            legendgroup=label,
            hovertemplate=
                "Node: %{customdata[0]}<br>" +
                "Decayed Weight: %{y}<br>" +
                "Year: %{x}<br>" +
                "<extra></extra>",
            customdata=np.stack((
                df_["key"],
                df_["ew"]
            ), axis=-1),
            showlegend=False
    ), row=2, col=1)
        
for label in taxonomy.NODE_HIGHLIGHTS.values():
    mask = res_node_w["color_group"] == label
    subfigs.add_trace(go.Scatter(
        x=res_node_w[mask]["year"],
        y=res_node_w[mask]["ew"],
        mode='markers+lines',
        marker=dict(color=node_palette[label], opacity=1, size=dotsize, symbol='circle'),
        line = dict(width = highlight_line_width),
        name=label,
        legendgroup=label,
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Decayed Weight: %{y}<br>" +
            "Year: %{x}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            res_node_w[mask]["key"],
            res_node_w[mask]["ew"]
        ), axis=-1),
        showlegend=True
    ), row=2, col=1)

# ---------------------------------------------------
# E) Pairs - Weighted decay weight over time
# ---------------------------------------------------

for label in ["Other"]:
    others = (res_pair_w[res_pair_w["color_group"] == "Other"].sort_values(["key", "year"]))
    for i, (nm, df_) in enumerate(others.groupby("key", sort=False)):
        subfigs.add_trace(go.Scatter(
            x=df_["year"],
            y=df_["ew"],
            mode='markers+lines',
            marker=dict(color=pair_palette[label], opacity=0.5, size=others_dotsize, symbol='pentagon'),
            line = dict(width=others_line_width),
            name=label,
            legendgroup=label,
            hovertemplate=
                "Pair: %{customdata[0]}<br>" +
                "Decayed Weight: %{y}<br>" +
                "Year: %{x}<br>" +
                "<extra></extra>",
            customdata=np.stack((
                df_["key"],
                df_["ew"]
            ), axis=-1),
            showlegend=False
    ), row=2, col=2)

for label in taxonomy.PAIR_HIGHLIGHTS.values():
    mask = res_pair_w["color_group"] == label
    subfigs.add_trace(go.Scatter(
        x=res_pair_w[mask]["year"],
        y=res_pair_w[mask]["ew"],
        mode='markers+lines',
        marker=dict(color=pair_palette[label], opacity=1, size=dotsize, symbol='pentagon'),
        line = dict(width = highlight_line_width),
        name=label,
        legendgroup=label,
        hovertemplate=
            "Pair: %{customdata[0]}<br>" +
            "Decayed Weight: %{y}<br>" +
            "Year: %{x}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            res_pair_w[mask]["key"],
            res_pair_w[mask]["ew"]
        ), axis=-1),
        showlegend=True
    ), row=2, col=2)    


# ---------------------------------------------------
# F) Triangles - Weighted decay weight over time
# ---------------------------------------------------

for label in ["Other"]:
    others = (res_triple_w[res_triple_w["color_group"] == "Other"].sort_values(["key", "year"]))
    for i, (nm, df_) in enumerate(others.groupby("key", sort=False)):
        subfigs.add_trace(go.Scatter(
            x=df_["year"],
            y=df_["ew"],
            mode='markers+lines',
            marker=dict(color=triangle_palette[label], opacity=0.5, size=others_dotsize, symbol='triangle-up'),
            line = dict(width=others_line_width),
            name=label,
            legendgroup=label,
            hovertemplate=
                "Triangle: %{customdata[0]}<br>" +
                "Decayed Weight: %{y}<br>" +
                "Year: %{x}<br>" +
                "<extra></extra>",
            customdata=np.stack((
                df_["key"],
                df_["ew"]
            ), axis=-1),
            showlegend=False
    ), row=2, col=3)
        

for label in taxonomy.TRIANGLE_HIGHLIGHTS.values():
    mask = res_triple_w["color_group"] == label
    subfigs.add_trace(go.Scatter(
        x=res_triple_w[mask]["year"],
        y=res_triple_w[mask]["ew"],
        mode='markers+lines',
        marker=dict(color=triangle_palette[label], opacity=1, size=dotsize, symbol='triangle-up'),
        line = dict(width = highlight_line_width),
        name=label,
        legendgroup=label,
        hovertemplate=
            "Triangle: %{customdata[0]}<br>" +
            "Decayed Weight: %{y}<br>" +
            "Year: %{x}<br>" +
            "<extra></extra>",
        customdata=np.stack((
            res_triple_w[mask]["key"],
            res_triple_w[mask]["ew"]
        ), axis=-1),
        showlegend=True
    ), row=2, col=3)
    
    
    
subfigs.update_yaxes(title_text="Decayed Weight", title_standoff=1)
subfigs.update_xaxes(title_text="Year", title_standoff=1)
subfigs.add_annotation(
    text="(a)",
    xref="paper", yref="paper",
    x=-0.028, y=1.02,
    showarrow=False,
    font=dict(size=7, color="black"),
    xanchor="left",
    align="left"
)
subfigs.add_annotation(
    text="(b)",
    xref="paper", yref="paper",
    x=0.335, y=1.02,
    showarrow=False,
    font=dict(size=7, color="black"),
    xanchor="left",
    align="left"
)
subfigs.add_annotation(
    text="(c)",
    xref="paper", yref="paper",
    x=0.695, y=1.02,
    showarrow=False,
    font=dict(size=7, color="black"),
    xanchor="left",
    align="left"
)

subfigs.add_annotation(
    text="(d)",
    xref="paper", yref="paper",
    x=-0.02, y=0.48,
    showarrow=False,
    font=dict(size=7, color="black"),
    xanchor="left",
    align="left"
)
subfigs.add_annotation(
    text="(e)",
    xref="paper", yref="paper",
    x=0.3466666667, y=0.48,
    showarrow=False,
    font=dict(size=7, color="black"),
    xanchor="left",
    align="left"
)
subfigs.add_annotation(
    text="(f)",
    xref="paper", yref="paper",
    x=0.7133333333, y=0.48,
    showarrow=False,
    font=dict(size=7, color="black"),
    xanchor="left",
    align="left"
)


subfigs.update_layout(
    template=template_type,
    showlegend=True,
    font=dict(family="Arial", size=7),
    margin={'t':10,'l':10,'b':10,'r':10}
)

subfigs.show()

In [None]:
subfigs.write_image(f"{path}/results/feature-only-KG/img/fig5.svg", width=cm_to_px(18), height=cm_to_px(12))