In [1]:
import pandas as pd
import numpy as np
import os
from datetime import date
today = date.today()
# NOTE: `path` is the PARENT of the current working directory (one level above the folder the
# notebook runs in), even though the message printed below labels it as the working directory.
# Every data and result location used later in the notebook is built from `path`.
path = os.path.dirname(os.getcwd())
print(f'📂 Current working directory: {path}')
print(f'💚 Today is {today}')
import sys
# Make <path>/scripts importable; this must run before the two project imports below.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'scripts'))

# Project-local modules loaded from <path>/scripts.
import ss_api_call as ss
import graph

# Plotting
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

📂 Current working directory: /Users/serenekim/Desktop/PhD/meta-wealth_mobility
💚 Today is 2025-10-31


In [2]:
# Period labels in chronological order. Used as the category order of the period x-axes
# and to pair consecutive periods when the Kendall tau overlays are computed.
period_order = ["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"]

# Centimetre -> pixel helper ----------------------------------------------------------------
# write_image has no 'unit' argument, so figure sizes given in cm are converted to pixels here.
def cm_to_px(cm, dpi=96):
    """Return `cm` centimetres as a whole number of pixels at `dpi` dots per inch.

    The fractional part is truncated (not rounded); 96 DPI is the default.
    """
    pixels = cm * dpi / 2.54  # 2.54 cm per inch
    return int(pixels)

# Fig 2. Descriptives

In [3]:
# Load all top degree and top strength data across periods
def _load_period_csvs(directory):
    """Read every ``*.csv`` file in `directory` into a dict keyed by period label.

    The period label is the part of the file name before the first underscore; it is
    also written into a ``period`` column of the corresponding frame.
    Files are visited in sorted name order because ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the row order of the concatenated
    frames differ between machines.
    """
    frames = {}
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".csv"):
            continue
        period_label = file_name.split("_")[0]
        frame = pd.read_csv(f"{directory}/{file_name}")
        frame['period'] = period_label
        frames[period_label] = frame
    return frames


top_degree_all = _load_period_csvs(f"{path}/results/feature-only-KG/periods/top_degree")
all_top_degree_df = pd.concat(top_degree_all.values(), ignore_index=True)


top_strength_all = _load_period_csvs(f"{path}/results/feature-only-KG/periods/top_strength")
all_top_strength_df = pd.concat(top_strength_all.values(), ignore_index=True)

In [4]:
# Triangle and pair count tables; the Fig 2 cell copies them into tri_df / pair_df.
tri = pd.read_csv(f"{path}/results/feature-only-KG/triangle_counts_papers.csv")
pairs = pd.read_csv(f"{path}/results/feature-only-KG/pair_counts_perYear.csv")

In [5]:
from scipy import stats

def add_kendall_overlay(fig, df, value_col, period_col, row, col, period_order,
                        name="Kendall Tau-b", annotate_mean=True, overlay_idx=10, show_overlay_x=True, show_overlay_y=True):
    """
    Compute Kendall Tau-b correlations between consecutive periods and overlay
    them as a line plot with right-hand axis on a subplot.

    Parameters
    ----------
    fig : plotly figure built with ``make_subplots``; it is modified in place.
    df : DataFrame holding one value per entity and period. The entity column is
        ``"node"`` if present, else ``"names"``, else the first column.
    value_col : column whose values are compared between periods.
    period_col : column holding the period label of each row.
    row, col : 1-based grid position of the subplot that receives the overlay.
    period_order : sequence of period labels in chronological order.
    name : legend name of the overlay trace.
    annotate_mean : if True, write the mean tau in the top-left corner of the subplot.
    overlay_idx : numeric suffix of the extra x/y axes created for the overlay
        (``xaxis<overlay_idx>`` / ``yaxis<overlay_idx>``); must be unique per call and
        must not collide with an axis the figure already uses.
    show_overlay_x, show_overlay_y : whether tick labels (and, for y, the axis title)
        of the overlay axes are drawn.

    Returns
    -------
    dict
        ``{(p1, p2): tau}`` for every pair of consecutive periods; ``nan`` when the two
        periods share fewer than two entities.
    """

    # --- Pick the entity column once; it does not depend on the period ---
    if "node" in df.columns:
        key = "node"
    elif "names" in df.columns:
        key = "names"
    else:
        key = df.columns[0]

    def _values_by_entity(period):
        """Values of `value_col` for one period, indexed by entity."""
        rows = df[df[period_col] == period]
        # Keep the first row per entity: with repeated entities the two periods could
        # not be aligned one-to-one and kendalltau would receive unequal-length inputs.
        return rows.drop_duplicates(subset=key).set_index(key)[value_col]

    # --- Compute Kendall Tau_b between consecutive periods ---
    tau_results = {}
    for i in range(len(period_order) - 1):
        p1, p2 = period_order[i], period_order[i + 1]
        values_1 = _values_by_entity(p1)
        values_2 = _values_by_entity(p2)

        # align by common entities
        common = values_1.index.intersection(values_2.index)
        if len(common) < 2:
            tau_results[(p1, p2)] = np.nan
            continue

        # Tau-b depends only on the ordering (and ties) of the values, so the raw
        # values can be passed directly; no explicit ranking step is needed.
        tau, _ = stats.kendalltau(values_1.loc[common], values_2.loc[common])
        tau_results[(p1, p2)] = tau

    # --- Prepare labels and τ values ---
    tau_labels = [f"T{i} vs. T{i+1}" for i in range(1, len(period_order))]
    tau_vals = [tau_results.get((period_order[i], period_order[i + 1]), np.nan)
                for i in range(len(period_order) - 1)]

    # Each τ is drawn halfway between the two period categories it compares.
    x_positions = [i + 0.5 for i in range(len(tau_labels))]

    # --- Add Kendall Tau line trace ---
    fig.add_trace(
        go.Scatter(
            x=x_positions,  # Numeric positions between categories
            y=tau_vals,
            mode="lines+markers",
            line=dict(color="black", dash="dashdot", width=1),
            marker=dict(color="black", size=3),
            name=name,
            hovertemplate="Comparison: %{text}<br>τ(b) = %{y:.3f}<extra></extra>",
            text=tau_labels,
            showlegend=(row == 1 and col == 1),
        ),
        row=row, col=col,
    )

    # --- Identify the subplot's own axes ---
    # add_trace(row=, col=) has just bound the trace to that subplot's axes, so read the
    # axis ids from the trace instead of assuming a fixed number of grid columns.
    overlay_trace = fig.data[-1]
    base_x = overlay_trace.xaxis or "x"   # e.g. "x", "x2", ...
    base_y = overlay_trace.yaxis or "y"
    x_domain = fig.layout["xaxis" + base_x[1:]].domain
    y_domain = fig.layout["yaxis" + base_y[1:]].domain

    # --- Use numeric overlay axis IDs (valid syntax) ---
    overlay_x = f"xaxis{overlay_idx}"
    overlay_y = f"yaxis{overlay_idx}"

    fig.update_layout({
        overlay_x: dict(
            domain=x_domain,
            anchor=f"y{overlay_idx}",
            overlaying=base_x,
            side="top",
            tickmode="array",
            tickvals=x_positions,  # ticks sit between the period categories
            ticktext=tau_labels,
            showgrid=True,
            zeroline=False,
            categoryorder="array",
            # Same extent as a category axis with len(period_order) categories.
            range=[-0.5, len(period_order) - 0.5],
            showticklabels=show_overlay_x,
        ),
        overlay_y: dict(
            domain=y_domain,
            anchor=f"x{overlay_idx}",
            overlaying=base_y,
            side="right",
            title="Kendall τ(b)" if show_overlay_y else "",
            range=[-1, 1],
            tickformat=".2f",
            showticklabels=show_overlay_y,
            dtick = 0.5,
            title_standoff=2,
            ticklabelstandoff=2
        ),
    })

    # Move the τ trace from the subplot's own axes onto the overlay axes
    overlay_trace.update(xaxis=f"x{overlay_idx}", yaxis=f"y{overlay_idx}")

    # --- Optional mean τ annotation ---
    if annotate_mean:
        # Average only the defined values; avoids numpy's "mean of empty slice" warning
        # when every comparison is undefined.
        finite_taus = [t for t in tau_vals if not np.isnan(t)]
        tau_mean = np.mean(finite_taus) if finite_taus else np.nan
        fig.add_annotation(
            text=f"⟨τ⟩ = {tau_mean:.2f}",
            xref=f"{base_x} domain",
            yref=f"{base_y} domain",
            x=0.02, y=0.98,
            showarrow=False,
            font=dict(size=7, color="black"),
            align="right"
        )

    return tau_results

In [6]:
import plotly.graph_objects as go
# Visualization settings ----------------------------------
template_type = "simple_white"
showlegend = False
dotsize = 4 # 20 when exporting it to html


# Load Datasets--------------------------------------------
# Work on copies so that re-running this cell never mutates the frames loaded above.
deg_df = all_top_degree_df.copy()
pair_df = pairs.copy()
tri_df = tri.copy()
strength_df = all_top_strength_df.copy()

# Pass every label through graph.norm (project helper) so that data labels and the
# highlight-dictionary keys below are spelled the same way.
deg_df["node"] = deg_df["node"].apply(graph.norm)
pair_df["from_name"] = pair_df["from_name"].apply(graph.norm)
pair_df["to_name"] = pair_df["to_name"].apply(graph.norm)
tri_df["names"] = tri_df["names"].apply(graph.norm)
pair_df["names"] = (pair_df["from_name"] + " - " + pair_df["to_name"]).apply(graph.norm)
strength_df["node"] = strength_df["node"].apply(graph.norm)


# Keep one row per (entity, period) for degree, pairs and triangles (avoids overplotting).
# NOTE(review): strength_df is NOT de-duplicated here (that step was commented out in the
# original cell) — confirm this is intended, since strength_df is what panel (a) plots.
deg_df = deg_df.drop_duplicates(subset=['node', 'period'])
pair_df = pair_df.drop_duplicates(subset=['names', 'period'])
tri_df = tri_df.drop_duplicates(subset=['names', 'period'])


deg_df["period"] = pd.Categorical(deg_df["period"], categories=period_order, ordered=True)


# Choose which nodes to color; everything else will be grey ------------------------------
NODE_HIGHLIGHTS = {
    "Intergenerational Wealth Mobility and Inheritance": "R4",
    "Regression-based Measures": "M1",
    "Empirical Estimates and Determinants": "R2",
    "No dataset": "D13",
    "Panel/Longitudinal Surveys": "D1"
}

PAIR_HIGHLIGHTS = {
    'Regression‐based Measures - Intergenerational Wealth Mobility and Inheritance': "M1-R4",
    'Regression‐based Measures - Empirical Estimates and Determinants': "M1-R2",
    'Regression‐based Measures - Panel/Longitudinal Surveys': "M1-D1",
    'Panel/Longitudinal Surveys - Intergenerational Wealth Mobility and Inheritance': "D1-R4",
    'Panel/Longitudinal Surveys - Empirical Estimates and Determinants': "D1-R2"
}

TRIANGLE_HIGHLIGHTS = {
    '[Panel/Longitudinal Surveys, Regression‐based Measures, Empirical Estimates and Determinants]': "D1-M1-R2",
    '[Panel/Longitudinal Surveys, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D1-M1-R4",
    '[Linked Administrative Data, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D6-R4-M1",
    '[National Survey Data, Regression‐based Measures, Empirical Estimates and Determinants]': "D3-M1-R2",
    '[National Survey Data, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D3-M1-R4"
}


# Normalise the keys the same way as the data labels so .map() below can match them.
NODE_HIGHLIGHTS = {graph.norm(k): v for k, v in NODE_HIGHLIGHTS.items()}
PAIR_HIGHLIGHTS = {graph.norm(k): v for k, v in PAIR_HIGHLIGHTS.items()}
TRIANGLE_HIGHLIGHTS = {graph.norm(k): v for k, v in TRIANGLE_HIGHLIGHTS.items()}

# Rows whose label is not a highlight key fall into the grey "Other" group.
deg_df["color_group"] = deg_df["node"].map(NODE_HIGHLIGHTS).fillna("Other")
pair_df["color_group"] = pair_df["names"].map(PAIR_HIGHLIGHTS).fillna("Other")
tri_df["color_group"] = tri_df["names"].map(TRIANGLE_HIGHLIGHTS).fillna("Other")
strength_df["color_group"] = strength_df["node"].map(NODE_HIGHLIGHTS).fillna("Other")


# Define color palette ---------------------------------------------------------------------
node_palette = {"R4": "#008A69",
              "M1": "#1964B0",
              "R2": "#E9DC6D",
              "D13":"#DB5829",
              "D1": "#882D71",
              "Other": "#DEDEDE"
             }

pair_palette = {
    "M1-R4": "#008A69",
    "M1-R2": "#1964B0",
    "M1-D1": "#E9DC6D",
    "D1-R4": "#DB5829",
    "D1-R2": "#882D71",
    "Other": "#DEDEDE"
}

triangle_palette = {
    "D1-M1-R2": "#008A69",
    "D1-M1-R4": "#1964B0",
    "D6-R4-M1": "#E9DC6D",
    "D3-M1-R2": "#DB5829",
    "D3-M1-R4": "#882D71",
    "Other": "#DEDEDE"
}

# Horizontal offset (Box `pointpos`) of each highlighted group inside a period column,
# so the coloured markers of one period sit side by side instead of on top of each other.
point_offsets_nodes = {
    "R4": -1,
    "M1": -0.5,
    "R2": 0.0,
    "D13": 0.5,
    "D1": 1
}
point_offsets_pairs= {
    "M1-R4": -1,
    "M1-R2": -0.5,
    "M1-D1": 0.0,
    "D1-R4": 0.5,
    "D1-R2": 1,
}

point_offsets_triplet = {
    "D1-M1-R2": -1,
    "D1-M1-R4": -0.5,
    "D6-R4-M1": 0.0,
    "D3-M1-R4": 0.5,
    "D3-M1-R2": 1,
}

# Create subplots ----------------------------------------------------------------
subplots = sp.make_subplots(rows=1, cols=3, column_widths=[0.33, 0.33, 0.33], horizontal_spacing=0.1,)


# One spec per panel. Every panel is drawn the same way (see the loop below); only the
# data, palette, marker symbol and hover text differ.
panel_specs = [
    dict(  # (a) nodes: strength / 2
        col=1, frame=strength_df, y=strength_df["strength"] / 2,
        labels=list(NODE_HIGHLIGHTS.values()),
        palette=node_palette, offsets=point_offsets_nodes,
        symbol=None, other_opacity=0.5,
        hovertemplate=(
            "Node: %{customdata[0]}<br>"
            "Kind: %{customdata[1]}<br>"
            "Strength: %{y}<br><extra></extra>"
        ),
        custom_cols=["node", "kind", "strength"],
    ),
    dict(  # (b) pairs
        col=2, frame=pair_df, y=pair_df["weight_sum"],
        labels=list(PAIR_HIGHLIGHTS.values()),
        palette=pair_palette, offsets=point_offsets_pairs,
        symbol='pentagon', other_opacity=0.5,
        hovertemplate=(
            "Pair: %{customdata[0]}<br>"
            "Count: %{y}<br><extra></extra>"
        ),
        custom_cols=["names", "weight_sum"],
    ),
    dict(  # (c) triangles
        col=3, frame=tri_df, y=tri_df["n_papers"],
        labels=sorted(TRIANGLE_HIGHLIGHTS.values()),
        palette=triangle_palette, offsets=point_offsets_triplet,
        symbol='triangle-up', other_opacity=0.2,
        hovertemplate=(
            "Triangle: %{customdata[0]}<br>"
            "Count: %{y}<br><extra></extra>"
        ),
        custom_cols=["names", "n_papers"],
    ),
]

# Each group is a Box trace whose box is invisible (no outline, no whiskers, transparent
# fill) so that only the jittered points show. "Other" is added first so it is drawn
# underneath: centred on the period with jitter 0.3; highlighted groups use jitter 0.7
# and their own horizontal offset.
for spec in panel_specs:
    frame = spec["frame"]
    for label in ["Other", *spec["labels"]]:
        is_other = label == "Other"
        mask = frame["color_group"] == label

        marker = dict(
            color=spec["palette"][label],
            opacity=spec["other_opacity"] if is_other else 1.0,
            size=dotsize,
        )
        if spec["symbol"] is not None:
            marker["symbol"] = spec["symbol"]

        subplots.add_trace(go.Box(
            x=frame.loc[mask, "period"],
            y=spec["y"][mask],
            boxpoints='all',
            jitter=0.3 if is_other else 0.7,
            pointpos=0 if is_other else spec["offsets"][label],
            whiskerwidth=0,      # no whiskers
            line=dict(width=0),  # no box outline
            fillcolor='rgba(0,0,0,0)',  # transparent background
            marker=marker,
            name=label,
            legendgroup=label,
            hovertemplate=spec["hovertemplate"],
            customdata=np.stack(
                [frame.loc[mask, column] for column in spec["custom_cols"]],
                axis=-1,
            ),
            showlegend=True
        ), row=1, col=spec["col"])


# Kendall tau overlays: each call needs its own overlay axis index.
strength_kendall_results = add_kendall_overlay(
    subplots, strength_df, value_col="strength", period_col="period",
    row=1, col=1, period_order=period_order, overlay_idx=10, show_overlay_x=True, show_overlay_y=False
)

pair_kendall_results = add_kendall_overlay(
    subplots, pair_df, value_col="weight_sum", period_col="period",
    row=1, col=2, period_order=period_order, overlay_idx=11, show_overlay_x=True, show_overlay_y=False
)

triangle_kendall_results = add_kendall_overlay(
    subplots, tri_df, value_col="n_papers", period_col="period",
    row=1, col=3, period_order=period_order, overlay_idx=12, show_overlay_x=True, show_overlay_y=True
)


# Force chronological period order on the x-axes.
subplots.update_xaxes(categoryorder="array", categoryarray=period_order)

subplots.update_yaxes(row=1, col=1, title_text="Node Count", title_standoff=1) # Strength/2
subplots.update_yaxes(row=1, col=2, title_text="Pair Count", title_standoff=1)
subplots.update_yaxes(row=1, col=3, title_text="Triangle Count", title_standoff=1)


# Panel labels, placed in paper coordinates just above the top-left corner of each panel.
for panel_label, label_x in zip(["(a)", "(b)", "(c)"], [-0.03, 0.334433, 0.7000009]):
    subplots.add_annotation(
        text=panel_label,
        xref="paper", yref="paper",
        x=label_x, y=1.02,
        showarrow=False,
        font=dict(size=8, color="black"),
        xanchor="left",
        align="left"
    )

subplots.update_layout(
    template=template_type,
    showlegend=showlegend,
    xaxis=dict(categoryorder="array", categoryarray=period_order),
    font=dict(family="Arial", size=7),
    margin={'t':2,'l':10,'b':2,'r':10}
)


In [101]:
# subplots.write_html(f"{path}/results/feature-only-KG/img/fig2.html",
#                     include_plotlyjs='cdn',
#                     full_html=True,
#                     auto_open=False)


# Export the figure as SVG at 18 cm x 6 cm (sizes are converted to pixels with cm_to_px).
# NOTE: `subplots` is rebound by the Fig 3 cell further down, so run this right after the
# Fig 2 cell; otherwise the Fig 3 figure is written under the Fig 2 file name.
subplots.write_image(f"{path}/results/feature-only-KG/img/fig2_2_jitter.svg",
                     width=cm_to_px(18), height=cm_to_px(6))

In [7]:
# Display the triangle table prepared in the Fig 2 cell (normalised names, color_group added).
tri_df

Unnamed: 0,period,dt.name,m.name,rq.name,names,n_papers,color_group
0,-2000,No dataset,Decomposition / Structural Approaches,Theoretical and Structural Models,"[No dataset, Decomposition / Structural Approa...",4,Other
1,-2000,Administrative/Registry Data,Regression‐based Measures,Intergenerational Wealth Mobility and Inheritance,"[Administrative/Registry Data, Regression-base...",2,Other
2,-2000,Linked Administrative Data,Regression‐based Measures,Intergenerational Wealth Mobility and Inheritance,"[Linked Administrative Data, Regression-based ...",2,D6-R4-M1
3,-2000,No dataset,Regression‐based Measures,Empirical Estimates and Determinants,"[No dataset, Regression-based Measures, Empiri...",2,Other
4,-2000,Panel/Longitudinal Surveys,Regression‐based Measures,Empirical Estimates and Determinants,"[Panel/Longitudinal Surveys, Regression-based ...",2,D1-M1-R2
...,...,...,...,...,...,...,...
490,2021-2025,Panel/Longitudinal Surveys,Regression‐based Measures,"Mobility and Non-Income Outcomes (Health, Well...","[Panel/Longitudinal Surveys, Regression-based ...",0,Other
491,2021-2025,Opportunity Atlas,Regression‐based Measures,"Mobility and Non-Income Outcomes (Health, Well...","[Opportunity Atlas, Regression-based Measures,...",0,Other
492,2021-2025,Others_DataType,Rank‐based Measures,"Mobility and Non-Income Outcomes (Health, Well...","[Others_DataType, Rank-based Measures, Mobilit...",0,Other
493,2021-2025,Panel/Longitudinal Surveys,Rank‐based Measures,"Mobility and Non-Income Outcomes (Health, Well...","[Panel/Longitudinal Surveys, Rank-based Measur...",0,Other


In [18]:
# Highlighted triangles that share the same paper count within a period.
# Count the highlighted rows per (period, n_papers) and keep the combinations hit more than once.
cg_counts = (
    tri_df[tri_df['color_group'] != "Other"]
    .groupby(['period', 'n_papers'])
    .size()
    .reset_index(name='counts')
)

cg_counts = cg_counts[cg_counts['counts'] > 1]

# period -> one "A, B" string per tied paper count.
# A period can have ties at several paper counts; collecting them in a list keeps all
# of them (assigning straight to the period key would keep only the last one).
ties_by_period = {}
for _, row in cg_counts.iterrows():
    mask = (tri_df['period'] == row['period']) & (tri_df['n_papers'] == row['n_papers']) & (tri_df['color_group'] != "Other")
    color_groups = tri_df[mask]['color_group'].unique().tolist()
    ties_by_period.setdefault(row['period'], []).append(", ".join(color_groups))

# Same shape as before (period -> string); several tie levels are separated by " | ".
color_group_list = {tie_period: " | ".join(ties) for tie_period, ties in ties_by_period.items()}

color_group_list

Unnamed: 0,period,n_papers,n_groups
1,-2000,2,2
4,2001-2005,2,2
20,2021-2025,5,2
21,2021-2025,9,2


In [62]:
# Count how many highlighted triangle rows are flagged by mark_overlaps (True) versus not (False).
# NOTE(review): `mark_overlaps` is not defined or imported in any cell visible above this one;
# it presumably came from a cell that was removed or lives elsewhere — confirm, otherwise this
# cell raises NameError on a fresh kernel.
mark_overlaps(tri_df[tri_df['color_group'] != 'Other'], period_col="period", y_col="n_papers").value_counts()

False    19
True      8
Name: count, dtype: int64

# Fig 3. Betweenness

In [7]:
# Node betweenness per period --------------------------------------------------------------
# Files are read in sorted name order: os.listdir returns entries in arbitrary order, which
# would make the row order of the concatenated frame differ between machines.
top_betweenness = {}
for file in sorted(os.listdir(f"{path}/results/feature-only-KG/periods/top_betweenness")):
    if not file.endswith(".csv"):
        continue
    period = file.split("_")[0]  # period label = file-name prefix before the first "_"
    df = pd.read_csv(f"{path}/results/feature-only-KG/periods/top_betweenness/{file}")
    df['period'] = period

    # --- Normalization ---
    # Scale the raw score by 2 / ((n - 1)(n - 2)), with n = number of rows in this file.
    # NOTE(review): this treats the row count as the number of nodes of the period graph;
    # if the CSV only lists the top-ranked nodes, n is too small — confirm.
    n_nodes = df.shape[0]
    if n_nodes > 2:
        factor = 2 / ((n_nodes - 1) * (n_nodes - 2))
        df['score_norm'] = df['score'] * factor
    else:
        df['score_norm'] = df['score']  # fallback if very small graph (factor undefined)
    top_betweenness[period] = df
all_top_betweenness_df = pd.concat(top_betweenness.values(), ignore_index=True)

# Edge betweenness per period (used as stored, no normalization) ---------------------------
edge_betweenness = {}
for file in sorted(os.listdir(f"{path}/results/feature-only-KG/periods/edge_betweenness")):
    if not file.endswith(".csv"):
        continue
    period = file.split("_")[0]
    df = pd.read_csv(f"{path}/results/feature-only-KG/periods/edge_betweenness/{file}")
    df['period'] = period
    edge_betweenness[period] = df
all_edge_betweenness_df = pd.concat(edge_betweenness.values(), ignore_index=True)

In [8]:
# Visualization settings ----------------------------------
template_type = "simple_white"
showlegend = False
dotsize = 4 # 20 when exporting it to html


# Load Datasets--------------------------------------------
# Work on copies so that re-running this cell never mutates the frames loaded above.
node_btw = all_top_betweenness_df.copy()
edge_btw = all_edge_betweenness_df.copy()

# Pass every label through graph.norm (project helper) so that data labels and the
# highlight-dictionary keys below are spelled the same way.
node_btw["node"] = node_btw["node"].apply(graph.norm)
edge_btw["u"] = edge_btw["u"].apply(graph.norm)
edge_btw["v"] = edge_btw["v"].apply(graph.norm)
edge_btw["names"] = (edge_btw["u"] + " - " + edge_btw["v"]).apply(graph.norm)

node_btw["period"] = pd.Categorical(node_btw["period"], categories=period_order, ordered=True)


# Choose which nodes to color; everything else will be grey ------------------------------
NODE_HIGHLIGHTS = {
    "Intergenerational Wealth Mobility and Inheritance": "R4",
    "Regression-based Measures": "M1",
    "Empirical Estimates and Determinants": "R2",
    "No dataset": "D13",
    "Panel/Longitudinal Surveys": "D1"
}

PAIR_HIGHLIGHTS = {
    'Regression‐based Measures - Intergenerational Wealth Mobility and Inheritance': "M1-R4",
    'Regression‐based Measures - Empirical Estimates and Determinants': "M1-R2",
    'Regression‐based Measures - Panel/Longitudinal Surveys': "M1-D1",
    'Panel/Longitudinal Surveys - Intergenerational Wealth Mobility and Inheritance': "D1-R4",
    'Panel/Longitudinal Surveys - Empirical Estimates and Determinants': "D1-R2"
}

# Not used by this figure; kept so the name stays defined exactly as before for other cells.
TRIANGLE_HIGHLIGHTS = {
    '[Panel/Longitudinal Surveys, Regression‐based Measures, Empirical Estimates and Determinants]': "D1-M1-R2",
    '[Panel/Longitudinal Surveys, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D1-M1-R4",
    '[Linked Administrative Data, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D6-R4-M1",
    '[National Survey Data, Regression‐based Measures, Empirical Estimates and Determinants]': "D3-M1-R2",
    '[National Survey Data, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D3-M1-R4"
}


# Normalise the keys the same way as the data labels so .map() below can match them.
NODE_HIGHLIGHTS = {graph.norm(k): v for k, v in NODE_HIGHLIGHTS.items()}
PAIR_HIGHLIGHTS = {graph.norm(k): v for k, v in PAIR_HIGHLIGHTS.items()}
TRIANGLE_HIGHLIGHTS = {graph.norm(k): v for k, v in TRIANGLE_HIGHLIGHTS.items()}

# Rows whose label is not a highlight key fall into the grey "Other" group.
node_btw["color_group"] = node_btw["node"].map(NODE_HIGHLIGHTS).fillna("Other")
edge_btw["color_group"] = edge_btw["names"].map(PAIR_HIGHLIGHTS).fillna("Other")


# Define color palette ---------------------------------------------------------------------
node_palette = {"R4": "#008A69",
              "M1": "#1964B0",
              "R2": "#E9DC6D",
              "D13":"#DB5829",
              "D1": "#882D71",
              "Other": "#DEDEDE"
             }

pair_palette = {
    "M1-R4": "#008A69",
    "M1-R2": "#1964B0",
    "M1-D1": "#E9DC6D",
    "D1-R4": "#DB5829",
    "D1-R2": "#882D71",
    "Other": "#DEDEDE"
}

# Horizontal offset (Box `pointpos`) of each highlighted group inside a period column.
# Defined here (same values as in the Fig 2 cell) so this cell does not depend on the
# Fig 2 cell having been run first.
point_offsets_nodes = {
    "R4": -1,
    "M1": -0.5,
    "R2": 0.0,
    "D13": 0.5,
    "D1": 1
}
point_offsets_pairs= {
    "M1-R4": -1,
    "M1-R2": -0.5,
    "M1-D1": 0.0,
    "D1-R4": 0.5,
    "D1-R2": 1,
}


# Create subplots ----------------------------------------------------------------
subplots = sp.make_subplots(rows=1, cols=2, 
                            column_widths=[0.5, 0.5], 
                            horizontal_spacing=0.15,)

# One spec per panel: (a) node betweenness, (b) edge betweenness over time.
panel_specs = [
    dict(
        col=1, frame=node_btw, y_col="score_norm",
        labels=list(NODE_HIGHLIGHTS.values()),
        palette=node_palette, offsets=point_offsets_nodes,
        hovertemplate=(
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Betweenness: %{y}<br><extra></extra>"
        ),
        custom_cols=["node", "kind", "score_norm"],
    ),
    dict(
        col=2, frame=edge_btw, y_col="edge_betweenness_weighted",
        labels=list(PAIR_HIGHLIGHTS.values()),
        palette=pair_palette, offsets=point_offsets_pairs,
        hovertemplate=(
            "Pair: %{customdata[0]}<br>" +
            "Betweenness: %{y}<br>" +
            "<extra></extra>"
        ),
        custom_cols=["names", "edge_betweenness_weighted"],
    ),
]

# Each group is a Box trace whose box is invisible (no outline, no whiskers, transparent
# fill) so that only the jittered points show. "Other" is added first so it is drawn
# underneath: centred on the period with jitter 0.3; highlighted groups use jitter 0.7
# and their own horizontal offset.
for spec in panel_specs:
    frame = spec["frame"]
    for label in ["Other", *spec["labels"]]:
        is_other = label == "Other"
        mask = frame["color_group"] == label
        subplots.add_trace(go.Box(
            x=frame[mask]["period"],
            y=frame[mask][spec["y_col"]],
            name=label,
            legendgroup=label,
            marker=dict(
                color=spec["palette"][label],
                opacity=0.5 if is_other else 1,
                size=dotsize,
            ),
            line=dict(width=0),
            whiskerwidth=0,
            fillcolor='rgba(0,0,0,0)',
            boxpoints="all",
            jitter=0.3 if is_other else 0.7,
            pointpos=0 if is_other else spec["offsets"][label],
            hovertemplate=spec["hovertemplate"],
            customdata=np.stack(
                [frame[mask][column] for column in spec["custom_cols"]],
                axis=-1,
            ),
            showlegend=True
        ), row=1, col=spec["col"])


# Kendall tau overlays: each call needs its own overlay axis index.
# (`degree_kendall_results` holds the node-betweenness result; the name is kept as is.)
degree_kendall_results = add_kendall_overlay(
    subplots, node_btw, value_col="score_norm", period_col="period",
    row=1, col=1, period_order=period_order, overlay_idx=10, show_overlay_x=True, show_overlay_y=False
)

pair_kendall_results = add_kendall_overlay(
    subplots, edge_btw, value_col="edge_betweenness_weighted", period_col="period",
    row=1, col=2, period_order=period_order, overlay_idx=11, show_overlay_x=True, show_overlay_y=True
)


# Force chronological period order on the x-axes.
subplots.update_xaxes(categoryorder="array", categoryarray=period_order)
subplots.update_yaxes(row=1, col=1, title_text="Node Betweenness", title_standoff=1)
subplots.update_yaxes(row=1, col=2, title_text="Edge Betweenness", title_standoff=1)


subplots.update_layout(
    template=template_type,
    showlegend=showlegend,
    xaxis=dict(categoryorder="array", categoryarray=period_order),
    font=dict(family="Arial", size=7),
    margin={'t':2,'l':2,'b':2,'r':2}
)

# Add panel labels "(a)" and "(b)" above each subplot, in paper coordinates
# (small left margin for the first column, just past halfway for the second).
for panel_label, label_x in zip(['(a)', '(b)'], [-0.03, 0.544455]):
    subplots.add_annotation(
        text=panel_label,
        x=label_x,
        xref='paper',
        xanchor='left',
        y=1.02,
        yref='paper',
        showarrow=False,
        font=dict(family='Arial', size=8, color='black')
    )


In [122]:
# Export the figure as SVG at 12 cm x 6 cm (sizes are converted to pixels with cm_to_px).
# NOTE: `subplots` is also the name of the Fig 2 figure; run this right after the Fig 3 cell.
subplots.write_image(f"{path}/results/feature-only-KG/img/fig3_normed.svg",
                        width=cm_to_px(12), height=cm_to_px(6))

# Fig 4. Strength vs. Degree \& D/2*#Triplets vs. Betweenness

In [9]:
# Normalized degree per period.
# Files are read in sorted name order: os.listdir returns entries in arbitrary order, which
# would make the row order of the concatenated frame differ between machines.
norm_degree_all = {}
for file in sorted(os.listdir(f"{path}/results/feature-only-KG/periods/degree_normalized")):
    if not file.endswith(".csv"):
        continue
    period = file.split("_")[0]  # period label = file-name prefix before the first "_"
    df = pd.read_csv(f"{path}/results/feature-only-KG/periods/degree_normalized/{file}")
    df['period'] = period
    norm_degree_all[period] = df
all_norm_degree_df = pd.concat(norm_degree_all.values(), ignore_index=True)

In [10]:
# Run Fig 2 before this.
# This cell reuses strength_df, deg_df and node_palette as left by the Fig 2 cell and
# rebinds strength_df / deg_df to period-sorted versions.
strength_df = strength_df.sort_values(by="period", ascending=True)
deg_df = deg_df.sort_values(by="period", ascending=True)

# Twelve titles "<letter> <period>": the period list is repeated twice, so titles a-f and g-l
# each run through all periods once.
subplot_titles = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]
subplot_titles = [title + " " + period for title, period in zip(subplot_titles, period_order*2)]

fig = sp.make_subplots(rows=4, cols=3, subplot_titles=subplot_titles, vertical_spacing=0.09)

# One strength-vs-score scatter per period, filling the grid row by row (3 per row).
for idx, period in enumerate(period_order):
    # NOTE(review): this skips a period only when it is missing from BOTH frames. The inner
    # merge below is already empty when it is missing from either one — confirm whether
    # `or` was intended.
    if period not in strength_df['period'].values and period not in deg_df['period'].values:
        continue

    # Inner join of the strength and degree rows of this period on the shared key columns.
    period_data = pd.merge(
        strength_df[strength_df['period'] == period],
        deg_df[deg_df['period'] == period],
        on=['node', 'period', 'kind', 'color_group'],
        suffixes=('_strength', '_degree')
    )

    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 1
    
    # y values: use strength+1 so log axis is valid (no zeros)
    y_vals = period_data['strength'] + 1

    # Add scatter points (plotting strength+1 on log y-axis)
    # x is the `score` column (labelled "Degree" in the hover text); the hover shows the
    # unshifted strength from customdata rather than the plotted strength+1.
    fig.add_trace(go.Scatter(
        x=period_data['score'], 
        y=y_vals,
        mode='markers',
        marker=dict(
            size=4,
            # colour by highlight group; groups without a palette entry fall back to grey
            color=[node_palette.get(cg, "#C7C7C7") for cg in period_data['color_group']]
        ),
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Degree: %{x}<br>" +
            "Strength: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['strength']), axis=-1),
        showlegend=False
    ), row=row, col=col)
    
    # Fit an OLS line in log-space: log10(strength+1) ~ score
    # Rows with a missing score or strength are left out of the fit.
    mask = (~period_data['score'].isna()) & (~period_data['strength'].isna())
    x = period_data.loc[mask, 'score'].to_numpy()
    y_log = np.log10(period_data.loc[mask, 'strength'] + 1).to_numpy()
    if len(x) > 1:
        z = np.polyfit(x, y_log, 1)
        x_trend = np.linspace(x.min(), x.max(), 100)
        # convert back from log10 to linear (strength+1)
        y_trend = 10 ** (np.polyval(z, x_trend))
    else:
        # Fewer than two points: no line can be fitted, so an empty trace is added below.
        x_trend = np.array([])
        y_trend = np.array([])
    
    # Add trendline (on the same log y-axis)
    fig.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=1),
            showlegend=False,
            hoverinfo='skip',
        ),
        row=row, col=col
    )

    # Set y-axis to log for this subplot (strength plotted as strength+1)
    fig.update_yaxes(type="log", row=row, col=col)


# Normalized degree vs. degree (rows 3-4 of the grid) -------------------------------------------

# NOTE(review): NODE_HIGHLIGHTS and node_palette are not defined in this cell. The only
# definition of NODE_HIGHLIGHTS visible nearby is in the Fig 5 cell further down, where its
# keys are passed through graph.norm, while the node names here are mapped as they are.
# TODO confirm that an earlier cell defines it and that the key spelling matches this table.
norm_plt = all_norm_degree_df.sort_values(by=["period","degree"], ascending=[True,False])
norm_plt['color_group'] = norm_plt['node'].map(NODE_HIGHLIGHTS).fillna("Other")

for idx, period in enumerate(period_order):
    if period not in norm_plt['period'].values:
        continue

    period_data = norm_plt[norm_plt['period'] == period]

    # Same column layout as the strength panels, shifted down by two rows
    col = idx % 3 + 1
    row = idx // 3 + 3

    # Scatter points: x is the node degree, y the normalized degree
    fig.add_trace(go.Scatter(
        x=period_data['degree'],
        y=period_data['normalized_degree'],
        mode='markers',
        marker=dict(
            size=4,
            color=[node_palette.get(cg, "#C7C7C7") for cg in period_data['color_group']],
            symbol="circle"
        ),
        # The x value is the degree, so the hover text labels it as such (it was previously
        # labelled "#Triplets", left over from an earlier version that plotted triplets on x).
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Degree: %{x}<br>" +
            "Normalized_degree: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['normalized_degree']), axis=-1),
        showlegend=False
    ), row=row, col=col)

    # Power-law fit: normalized_degree = a * degree^b.
    # Only strictly positive values can be log-transformed, so other rows are excluded.
    mask = (period_data['degree'] > 0) & (period_data['normalized_degree'] > 0)
    x = period_data.loc[mask, 'degree'].to_numpy()
    y = period_data.loc[mask, 'normalized_degree'].to_numpy()
    if len(x) > 1:  # x and y come from the same mask, so one length check is enough
        # Straight-line fit in log-log space: log(y) = log(a) + b * log(x)
        logx = np.log(x)
        logy = np.log(y)
        b, loga = np.polyfit(logx, logy, 1)
        a = np.exp(loga)
        x_trend = np.linspace(x.min(), x.max(), 100)
        y_trend = a * x_trend ** b
    else:
        # Not enough points for a fit: add an empty trendline
        x_trend = np.array([])
        y_trend = np.array([])

    # Trendline
    fig.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=1),
            showlegend=False,
            hoverinfo='skip',
        ),
        row=row, col=col
    )

# Global look of the figure
fig.update_layout(
    template=template_type,
    showlegend=showlegend,
    font={"family": "Arial", "size": 7},
    margin=dict(t=5, l=5, b=5, r=5),
)

# Subplot titles are annotations; give them the same font size as the rest of the figure
fig.update_annotations(font={"size": 7})

panel_columns = (1, 2, 3)

# Rows 1-2 (strength panels): fixed ranges, y title on the first column only
for r in (1, 2):
    fig.update_yaxes(range=[0, 3], row=r)
    for c in panel_columns:
        fig.update_xaxes(range=[0, 20], row=r, col=c)
    fig.update_yaxes(title_text="Strength+1", ticklabelstandoff=1, title_standoff=1, row=r, col=1)

# x title under the second row only
for c in panel_columns:
    fig.update_xaxes(title_text="Degree", row=2, col=c)

# Rows 3-4 (normalized-degree panels): log-log axes. Ranges on log axes are given in
# log10 units, so [0, 2] spans 1 to 100 and [-1, 0] spans 0.1 to 1.
fig.update_xaxes(title_text="Degree", row=4)
for r in (3, 4):
    for c in panel_columns:
        fig.update_xaxes(type="log", autorange=False, range=[0, 2], row=r, col=c)
        y_title = "degree/2#Triplets" if c == 1 else None
        fig.update_yaxes(
            type="log",
            title_text=y_title,
            range=[-1, 0],
            ticklabelstandoff=1,
            title_standoff=1,
            row=r,
            col=c,
        )

# Keep every x-axis title tight against its axis
fig.update_xaxes(title_standoff=0)

fig.show()


In [13]:
# Export Fig 4 as SVG. write_image expects pixel sizes, so the 18 cm x 11.5 cm target is
# converted with cm_to_px.
fig4_size_px = {"width": cm_to_px(18), "height": cm_to_px(11.5)}
fig.write_image(f"{path}/results/feature-only-KG/img/fig4.svg", **fig4_size_px)

# Fig 6. Betweenness vs. Degree

In [22]:
# Short display codes for node names. Judging by the "Others_Measure" / "Others_DataType" /
# "Others_RqType" keys, M* codes are measures, D* codes data types and R* codes
# research-question types.
# NOTE(review): several keys (e.g. "Regression‐based Measures") are spelled with the
# Unicode hyphen U+2010 rather than ASCII "-"; keep the spelling in sync with the node
# names in the data unless graph.norm is confirmed to unify hyphens.
# NOTE(review): the measure codes jump from M7 to M9 — confirm that M8 is intentionally unused.
short_names = {
    # Measures
    "Regression‐based Measures": "M1",
    "Rank‐based Measures": "M2",
    "Transition Matrix / Probability Measures": "M3",
    "Absolute Mobility Measures": "M4",
    "Multigenerational Measures": "M5",
    "Decomposition / Structural Approaches": "M6",
    "Non‐parametric Approaches": "M7",
    "Others_Measure": "M9",
    # Data types
    "Panel/Longitudinal Surveys": "D1",
    "Administrative/Registry Data": "D2",
    "National Survey Data": "D3",
    "Opportunity Atlas": "D4",
    "Natural/Experimental Data": "D5",
    "Linked Administrative Data": "D6",
    "International Panel Data": "D7",
    "Rich List Data": "D8",
    "University/Institution Data" : "D9",
    "Pseudo-Panel/Household Budget Survey": "D10",
    "Archival/Historical Data": "D11",
    "Big Data": "D12",
    "No dataset": "D13",
    "Others_DataType": "D14",
    # Research-question types
    "Measurement and Methodological Advances": "R1",
    "Empirical Estimates and Determinants": "R2",
    "Policy, Institutional, and Geographic Impacts": "R3",
    "Intergenerational Wealth Mobility and Inheritance": "R4",
    "Demographic Differences in Mobility (Race, Gender, etc.)": "R5",
    "Mobility and Non-Income Outcomes (Health, Wellbeing, etc.)": "R6",
    "Theoretical and Structural Models": "R7",
    "Perceptions of Mobility and Attitudes": "R8",
    "Others_RqType": "R9",
}

In [23]:
# Pass both the lookup keys and the node names through graph.norm so that they are
# compared in the same normalised form, then attach the short code to every node.
short_names = {graph.norm(full_name): code for full_name, code in short_names.items()}
normalised_nodes = deg_df['node'].apply(graph.norm)
deg_df['node'] = normalised_nodes
deg_df['short_name'] = normalised_nodes.map(short_names)


In [24]:
# Sanity check of the mapping: a NaN in this list would mean that some node name has no
# entry in short_names.
deg_df['short_name'].unique()

array(['R1', 'D9', 'R9', 'R8', 'D4', 'D5', 'D10', 'R6', 'R5', 'D7', 'D3',
       'M2', 'M7', 'M5', 'R3', 'D13', 'D6', 'D14', 'D11', 'R2', 'R7',
       'M9', 'M4', 'D1', 'D2', 'M3', 'D12', 'M6', 'R4', 'M1'],
      dtype=object)

In [30]:
# Run Fig 2 before this: strength_df, deg_df, node_btw and node_palette are used here but
# are not created in this cell. deg_df also needs the 'short_name' column added above.
strength_df = strength_df.sort_values(by="period", ascending=True)
deg_df = deg_df.sort_values(by="period", ascending=True)
node_btw = node_btw.sort_values(by="period", ascending=True)

# Within-period rank percentiles. With ascending=False the highest score gets the smallest
# percentile, so a value below 0.5 means "upper half of the ranking".
deg_df['deg_rank_percentile'] = deg_df.groupby('period')['score'].rank(pct=True, ascending=False)
node_btw['btw_rank_percentile'] = node_btw.groupby('period')['score'].rank(pct=True, ascending=False)

subplot_titles = ["a", "b", "c", "d", "e", "f"]
subplot_titles = [title + " " + period for title, period in zip(subplot_titles, period_order)]

fig = sp.make_subplots(rows=2, cols=3, subplot_titles=subplot_titles)


def _percentile_trace(data, labelled):
    """Build one scatter trace of degree rank percentile (x) vs. betweenness rank percentile (y).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of the merged degree/betweenness table for one period. The columns read are
        'deg_rank_percentile', 'btw_rank_percentile', 'color_group', 'node', 'kind',
        'score_betweenness' and, when ``labelled`` is true, 'short_name'.
    labelled : bool
        If true, each marker also carries its short name as a text label above the point.

    Returns
    -------
    plotly.graph_objects.Scatter
    """
    trace_kwargs = dict(
        x=data['deg_rank_percentile'],
        y=data['btw_rank_percentile'],
        mode='markers+text' if labelled else 'markers',
        marker=dict(
            size=4,
            color=[node_palette.get(cg, "#C7C7C7") for cg in data['color_group']]
        ),
        # Labels describe what is actually plotted: x is a rank percentile and
        # customdata[2] is the betweenness score.
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Degree rank percentile: %{x}<br>" +
            "Betweenness: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((data['node'], data['kind'], data['score_betweenness']), axis=-1),
        showlegend=False,
    )
    if labelled:
        trace_kwargs.update(text=data['short_name'], textposition="top center")
    return go.Scatter(**trace_kwargs)


for idx, period in enumerate(period_order):
    # Skip periods that appear in neither of the two tables that are merged below
    if period not in deg_df['period'].values and period not in node_btw['period'].values:
        continue

    period_data = pd.merge(
        deg_df[deg_df['period'] == period],
        node_btw[node_btw['period'] == period],
        on=['node', 'period', 'kind', 'color_group'],
        suffixes=('_degree', '_betweenness')
    )

    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 1

    # A node gets a text label when any of its rows has a degree percentile below 0.5
    # together with a betweenness percentile above 0.5.
    in_quadrant = (period_data['deg_rank_percentile'] < 0.5) & (period_data['btw_rank_percentile'] > 0.5)
    labelled_nodes = period_data.loc[in_quadrant, 'node']
    is_labelled = period_data['node'].isin(labelled_nodes)

    # Two traces per panel instead of one per node; labelled points are drawn last
    for subset, labelled in ((period_data[~is_labelled], False), (period_data[is_labelled], True)):
        if not subset.empty:
            fig.add_trace(_percentile_trace(subset, labelled), row=row, col=col)

    # Add vertical and horizontal reference lines at the 0.5 percentile
    fig.add_shape(
        type="line",
        x0=0.5, x1=0.5,
        y0=0,
        y1=1,
        line=dict(color="black", width=1, dash="dot"),
        row=row, col=col
    )

    fig.add_shape(
        type="line",
        x0=0,
        x1=1,
        y0=0.5, y1=0.5,
        line=dict(color="black", width=1, dash="dot"),
        row=row, col=col
    )

fig.update_layout(
    template=template_type,
    showlegend=showlegend,
    font=dict(family="Arial", size=7),
)

fig.update_annotations(font=dict(size=7))

# NOTE(review): the [0, 1] range (like the title) is applied only to the y-axes of column 1
# and the x-axes of row 2; the remaining axes keep plotly's automatic range. Confirm whether
# a common range for all panels is intended.
fig.update_yaxes(col=1,title_text="Node Betweenness Percentile", title_standoff=1, ticklabelstandoff=1, range=[0,1])

fig.update_xaxes(row=2, title_text="Degree Rank Percentile", title_standoff=10, ticklabelstandoff=1, range=[0,1])

fig.show()








In [32]:
# Export Fig 6 as SVG. write_image expects pixel sizes, so the 18 cm x 8 cm target is
# converted with cm_to_px.
fig6_size_px = {"width": cm_to_px(18), "height": cm_to_px(8)}
fig.write_image(f"{path}/results/feature-only-KG/img/fig6.svg", **fig6_size_px)

# Fig 5. Resurgence

## Nodes

In [33]:
# Decayed-weight tables for single nodes: the plain table and its "papercounts" variant
res_node, res_node_w = (
    pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_node_approach_allyears2.csv"),
    pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_node_approach_papercounts_allyears2.csv"),
)
# Row counts of the two tables
(len(res_node), len(res_node_w))

(1500, 1500)

## Edges

In [34]:
# Decayed-weight tables for node pairs (edges): the plain table and its "papercounts" variant
res_pair, res_pair_w = (
    pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_edge_approach_allyears2.csv"),
    pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_edge_approach_papercounts_allyears2.csv"),
)
# Row counts of the two tables
(len(res_pair), len(res_pair_w))

(16800, 16800)

## Triplets

In [35]:
# Decayed-weight tables for triplets: the plain table and its "papercounts" variant
res_triple, res_triple_w = (
    pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_triplet_approach_allyears2.csv"),
    pd.read_csv(f"{path}/results/feature-only-KG/decaying_weights_over_time_triplet_approach_papercounts_allyears2.csv"),
)
# Row counts of the two tables
(len(res_triple), len(res_triple_w))

(7900, 7900)

In [36]:
# Preview of the triplet table (the rendered output shows only the first and last rows).
# The "Unnamed: 0" column in the output is the index column stored in the CSV file.
res_triple

Unnamed: 0,key,year,identifier,measure,datatype,rqtype,paper_count,ew
0,Absolute Mobility Measures | Administrative/Re...,1976,filled,Absolute Mobility Measures,Administrative/Registry Data,Empirical Estimates and Determinants,0,0.000000
1,Absolute Mobility Measures | Administrative/Re...,1977,filled,Absolute Mobility Measures,Administrative/Registry Data,Empirical Estimates and Determinants,0,0.000000
2,Absolute Mobility Measures | Administrative/Re...,1978,filled,Absolute Mobility Measures,Administrative/Registry Data,Empirical Estimates and Determinants,0,0.000000
3,Absolute Mobility Measures | Administrative/Re...,1979,filled,Absolute Mobility Measures,Administrative/Registry Data,Empirical Estimates and Determinants,0,0.000000
4,Absolute Mobility Measures | Administrative/Re...,1980,filled,Absolute Mobility Measures,Administrative/Registry Data,Empirical Estimates and Determinants,0,0.000000
...,...,...,...,...,...,...,...,...
7895,Transition Matrix / Probability Measures | Uni...,2021,filled,Transition Matrix / Probability Measures,University/Institution Data,Empirical Estimates and Determinants,0,0.757858
7896,Transition Matrix / Probability Measures | Uni...,2022,filled,Transition Matrix / Probability Measures,University/Institution Data,Empirical Estimates and Determinants,0,0.659754
7897,Transition Matrix / Probability Measures | Uni...,2023,filled,Transition Matrix / Probability Measures,University/Institution Data,Empirical Estimates and Determinants,0,0.574349
7898,Transition Matrix / Probability Measures | Uni...,2024,filled,Transition Matrix / Probability Measures,University/Institution Data,Empirical Estimates and Determinants,0,0.500000


In [37]:
# Marker sizes and line widths shared by all Fig 5 panels
dotsize = 3
others_dotsize = 1
others_line_width = 0.5
highlight_line_width = 1

# Choose which nodes to color; everything else will be grey ------------------------------
# The "Regression‐based Measures" key is spelled with the Unicode hyphen U+2010, like the
# same name in short_names, PAIR_HIGHLIGHTS and TRIANGLE_HIGHLIGHTS. It used to be written
# with an ASCII "-" here, which only matches the data if graph.norm unifies the two hyphens.
NODE_HIGHLIGHTS = {
    "Intergenerational Wealth Mobility and Inheritance": "R4",
    "Regression‐based Measures": "M1",
    "Empirical Estimates and Determinants": "R2",
    "No dataset": "D13",
    "Panel/Longitudinal Surveys": "D1"
}

# Pair keys have the form "<key1> - <key2>"
PAIR_HIGHLIGHTS = {
    'Regression‐based Measures - Intergenerational Wealth Mobility and Inheritance': "M1-R4",
    'Regression‐based Measures - Empirical Estimates and Determinants': "M1-R2",
    'Regression‐based Measures - Panel/Longitudinal Surveys': "M1-D1",
    'Panel/Longitudinal Surveys - Intergenerational Wealth Mobility and Inheritance': "D1-R4",
    'Panel/Longitudinal Surveys - Empirical Estimates and Determinants': "D1-R2"
}

# Triangle keys have the form "[<datatype>, <measure>, <rqtype>]"
TRIANGLE_HIGHLIGHTS = {
    '[Panel/Longitudinal Surveys, Regression‐based Measures, Empirical Estimates and Determinants]': "D1-M1-R2",
    '[Panel/Longitudinal Surveys, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D1-M1-R4",
    '[Linked Administrative Data, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D6-R4-M1",
    '[National Survey Data, Regression‐based Measures, Empirical Estimates and Determinants]': "D3-M1-R2",
    '[National Survey Data, Regression‐based Measures, Intergenerational Wealth Mobility and Inheritance]': "D3-M1-R4"
}

# The lookup keys go through graph.norm, exactly like the keys of the data tables below
NODE_HIGHLIGHTS = {graph.norm(k): v for k, v in NODE_HIGHLIGHTS.items()}
PAIR_HIGHLIGHTS = {graph.norm(k): v for k, v in PAIR_HIGHLIGHTS.items()}
TRIANGLE_HIGHLIGHTS = {graph.norm(k): v for k, v in TRIANGLE_HIGHLIGHTS.items()}

# Each table gets a normalised 'key' and a 'color_group' holding the highlight label, or
# "Other" for keys that are not highlighted. The same steps apply to the plain tables and
# to their "papercounts" variants.
for frame in (res_node, res_node_w):
    frame["key"] = frame["key"].apply(graph.norm)
    frame["color_group"] = frame["key"].map(NODE_HIGHLIGHTS).fillna("Other")

for frame in (res_pair, res_pair_w):
    frame["key"] = frame.apply(lambda r: f"{r['key1']} - {r['key2']}", axis=1).apply(graph.norm)
    frame["color_group"] = frame["key"].map(PAIR_HIGHLIGHTS).fillna("Other")

for frame in (res_triple, res_triple_w):
    frame["key"] = frame.apply(
        lambda r: f"[{r['datatype']}, {r['measure']}, {r['rqtype']}]", axis=1
    ).apply(graph.norm)
    frame["color_group"] = frame["key"].map(TRIANGLE_HIGHLIGHTS).fillna("Other")

# Colors of the highlighted triangles, keyed by the labels in TRIANGLE_HIGHLIGHTS
triangle_palette = {
    "D1-M1-R2": "#008A69",
    "D1-M1-R4": "#1964B0",
    "D6-R4-M1": "#E9DC6D",
    "D3-M1-R2": "#DB5829",
    "D3-M1-R4": "#882D71",
    "Other": "#DEDEDE"
}

# Chronological order, so that the lines are drawn from the earliest to the latest year
res_node = res_node.sort_values(by="year", ascending=True)
res_pair = res_pair.sort_values(by="year", ascending=True)
res_triple = res_triple.sort_values(by="year", ascending=True)
res_node_w = res_node_w.sort_values(by="year", ascending=True)
res_pair_w = res_pair_w.sort_values(by="year", ascending=True)
res_triple_w = res_triple_w.sort_values(by="year", ascending=True)

subfigs = sp.make_subplots(rows=2, cols=3)


def _add_decay_panel(figure, frame, highlights, palette, symbol, hover_label,
                     row, col, show_in_legend):
    """Draw the decayed-weight time series of one table into one subplot.

    Parameters
    ----------
    figure : plotly.graph_objects.Figure
        Subplot figure that receives the traces.
    frame : pandas.DataFrame
        Table with the columns 'key', 'year', 'ew' and 'color_group'.
    highlights : dict
        Its values are the 'color_group' labels that get a coloured trace of their own,
        in the order in which they are drawn.
    palette : dict
        Colour per label; must contain every highlight label and "Other".
    symbol : str
        Plotly marker symbol used for all traces of the panel.
    hover_label : str
        Name shown in front of the key in the hover text ("Node", "Pair", "Triangle").
    row, col : int
        Position of the subplot.
    show_in_legend : bool
        Whether the highlighted traces create legend entries.
    """
    hovertemplate = (
        f"{hover_label}: %{{customdata[0]}}<br>"
        "Decayed Weight: %{y}<br>"
        "Year: %{x}<br>"
        "<extra></extra>"
    )

    # Background: one thin grey line per non-highlighted key, never shown in the legend
    background = frame[frame["color_group"] == "Other"].sort_values(["key", "year"])
    for _, series in background.groupby("key", sort=False):
        figure.add_trace(go.Scatter(
            x=series["year"],
            y=series["ew"],
            mode='markers+lines',
            marker=dict(color=palette["Other"], opacity=0.5, size=others_dotsize, symbol=symbol),
            line=dict(width=others_line_width),
            name="Other",
            legendgroup="Other",
            hovertemplate=hovertemplate,
            customdata=np.stack((series["key"], series["ew"]), axis=-1),
            showlegend=False
        ), row=row, col=col)

    # Foreground: one coloured trace per highlight label, drawn on top of the background
    for label in highlights.values():
        selected = frame[frame["color_group"] == label]
        figure.add_trace(go.Scatter(
            x=selected["year"],
            y=selected["ew"],
            mode='markers+lines',
            marker=dict(color=palette[label], opacity=1, size=dotsize, symbol=symbol),
            line=dict(width=highlight_line_width),
            name=label,
            legendgroup=label,
            hovertemplate=hovertemplate,
            customdata=np.stack((selected["key"], selected["ew"]), axis=-1),
            showlegend=show_in_legend
        ), row=row, col=col)


# Row 1 (A-C): unweighted decay weight over time for nodes, pairs and triangles.
# Row 2 (D-F): weighted decay weight over time for the same three kinds of keys.
decay_panels = [
    (res_node,     NODE_HIGHLIGHTS,     node_palette,     'circle',      "Node",     1, 1),
    (res_pair,     PAIR_HIGHLIGHTS,     pair_palette,     'pentagon',    "Pair",     1, 2),
    (res_triple,   TRIANGLE_HIGHLIGHTS, triangle_palette, 'triangle-up', "Triangle", 1, 3),
    (res_node_w,   NODE_HIGHLIGHTS,     node_palette,     'circle',      "Node",     2, 1),
    (res_pair_w,   PAIR_HIGHLIGHTS,     pair_palette,     'pentagon',    "Pair",     2, 2),
    (res_triple_w, TRIANGLE_HIGHLIGHTS, triangle_palette, 'triangle-up', "Triangle", 2, 3),
]

for panel_frame, panel_highlights, panel_palette, panel_symbol, panel_hover, panel_row, panel_col in decay_panels:
    _add_decay_panel(
        subfigs,
        panel_frame,
        panel_highlights,
        panel_palette,
        panel_symbol,
        panel_hover,
        row=panel_row,
        col=panel_col,
        # Each label is drawn in both rows under the same legendgroup. Showing it in the
        # legend once (row 1) avoids duplicated legend entries; the row-2 trace still
        # toggles together with it through the shared group.
        show_in_legend=(panel_row == 1),
    )
    
    
    
# Same axis titles on every panel
subfigs.update_yaxes(title_text="Decayed Weight", title_standoff=1)
subfigs.update_xaxes(title_text="Year", title_standoff=1)

# Panel labels as (text, x, y) in paper coordinates: (a)-(c) above the first row,
# (d)-(f) above the second row.
panel_label_positions = (
    ("(a)", -0.028, 1.02),
    ("(b)", 0.335, 1.02),
    ("(c)", 0.695, 1.02),
    ("(d)", -0.02, 0.48),
    ("(e)", 0.3466666667, 0.48),
    ("(f)", 0.7133333333, 0.48),
)
for panel_text, panel_x, panel_y in panel_label_positions:
    subfigs.add_annotation(
        text=panel_text,
        xref="paper", yref="paper",
        x=panel_x, y=panel_y,
        showarrow=False,
        font=dict(size=7, color="black"),
        xanchor="left",
        align="left"
    )

# Global look of the figure
subfigs.update_layout(
    template=template_type,
    showlegend=True,
    font={"family": "Arial", "size": 7},
    margin=dict(t=10, l=10, b=10, r=10),
)

subfigs.show()

In [140]:
# Export Fig 5 as SVG. write_image expects pixel sizes, so the 18 cm x 12 cm target is
# converted with cm_to_px.
fig5_size_px = {"width": cm_to_px(18), "height": cm_to_px(12)}
subfigs.write_image(f"{path}/results/feature-only-KG/img/fig5.svg", **fig5_size_px)

In [147]:
# Interactive HTML version of Fig 5, written next to the SVG export
fig5_html_path = f"{path}/results/feature-only-KG/img/fig5.html"
subfigs.write_html(fig5_html_path)