# 0. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import date
today = date.today()
path = os.path.dirname(os.getcwd())
print(f'üìÇ Current working directory: {path}')
print(f'üíö Today is {today}')
import sys
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'scripts'))
import ss_api_call as ss

üìÇ Current working directory: /Users/serenekim/Desktop/PhD/meta-wealth_mobility
üíö Today is 2025-10-10


# 1. Feature-Only KG in Neo4j

In [2]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv')

In [3]:
df.isna().sum()

id                           0
title                        0
year                         0
doi                         87
landing_page                 1
abstract_inverted_index     33
language                     0
is_oa                        0
oa_status                    0
oa_link                    295
abstract                    33
abstract_sm                104
authors_sm                 508
domain                       2
sort_gpt_1                  34
sort_gpt_2                 516
sort_gpt4o_1                33
sort_gpt4o_2               516
cited_by                   139
len_cited_by                 0
ref_count                    0
cited_by_count               0
Q1                           0
Q1_1                         0
Q2                           0
Q2_1                        62
Q2_2                       555
Q3                          55
Q4                         538
abs                          0
index                        0
category_n1                  0
measure 

In [4]:
df['period'] = pd.cut(df['year'], bins=[1900, 2000, 2005, 2010, 2015, 2020, 2025], right=True, labels=["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"])
df['period'].value_counts().sort_index()

period
-2000         23
2001-2005     23
2006-2010     65
2011-2015    141
2016-2020    183
2021-2025    182
Name: count, dtype: int64

In [7]:
import pandas as pd
from neo4j import GraphDatabase

# --- Connect ---
driver = GraphDatabase.driver("bolt://localhost:7690", auth=("neo4j", "your_password"))

# Normalize categories
df['category_1'] = df['category_1'].replace({'Others': 'Others_Measure'})
df['category_2'] = df['category_2'].replace({'Others': 'Others_Measure'})
df['data_cat']   = df['data_cat'].replace({'Others': 'Others_DataType'})
df['rq_cat']     = df['rq_cat'].replace({'Others': 'Others_RqType'})

def safe_str(val):
    if pd.isna(val) or str(val).strip().lower() in {"", "nan", "none"}:
        return None
    return str(val).strip()

# --- Constraints ---
with driver.session() as session:
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (m:Measure)   REQUIRE m.name IS UNIQUE")
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (ds:DataType) REQUIRE ds.name IS UNIQUE")
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (r:RqType)    REQUIRE r.name IS UNIQUE")

# --- Collect rows ---
rows = []
for _, row in df.iterrows():
    m1       = safe_str(row.get("category_1"))
    # m2       = safe_str(row.get("category_2"))
    datatype = safe_str(row.get("data_cat"))
    rqtype   = safe_str(row.get("rq_cat"))
    period = row.get("period")
    paper_id = safe_str(row.get("id"))

    if not all([m1, datatype, rqtype, period]):
        continue  # skip incomplete rows

    rows.append({
        "m1": m1,
        # "m2": m2,
        "datatype": datatype,
        "rqtype": rqtype,
        "period": period,
        "paper_id": paper_id
    })

# --- Cypher with undirected edges ---
cypher = """
UNWIND $rows AS row

MERGE (m1:Measure {name: row.m1})
MERGE (ds:DataType {name: row.datatype})
MERGE (rq:RqType {name: row.rqtype})

// Measure -- DataType
MERGE (m1)-[r1:CO_MEASURE_DATATYPE {period: row.period}]-(ds)
  ON CREATE SET r1.w = 1, r1.papers=[row.paper_id]
  ON MATCH  SET r1.w = r1.w + 1, r1.papers = apoc.coll.toSet(coalesce(r1.papers, []) + row.paper_id)

// DataType -- RqType
MERGE (ds)-[r2:CO_DATATYPE_RQTYPE {period: row.period}]-(rq)
  ON CREATE SET r2.w = 1, r2.papers=[row.paper_id]
  ON MATCH  SET r2.w = r2.w + 1, r2.papers = apoc.coll.toSet(coalesce(r2.papers, []) + row.paper_id)

// RqType -- Measure
MERGE (rq)-[r3:CO_RQTYPE_MEASURE {period: row.period}]-(m1)
  ON CREATE SET r3.w = 1, r3.papers=[row.paper_id]
  ON MATCH  SET r3.w = r3.w + 1, r3.papers = apoc.coll.toSet(coalesce(r3.papers, []) + row.paper_id)

"""

with driver.session() as session:
    if rows:
        session.run(cypher, rows=rows)

driver.close()


# 2. Centrality Measures

In [119]:
# pip install neo4j pandas
from neo4j import GraphDatabase
import pandas as pd

# --- CONFIG ---
NEO4J_URI  = "bolt://localhost:7690"
NEO4J_AUTH = ("neo4j", "your_password")
GRAPH_NAME = "features"

REL_TYPES = [
    "CO_MEASURE_DATATYPE",
    "CO_DATATYPE_RQTYPE",
    "CO_RQTYPE_MEASURE"
]

# --- UTILS ---
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

def run_df(cypher, params=None):
    """Run Cypher and return pandas DataFrame."""
    with driver.session() as s:
        result = s.run(cypher, params or {})
        data = result.data()
    return pd.DataFrame(data)

def run(cypher, params=None):
    with driver.session() as s:
        s.run(cypher, params or {})
        
def graph_name_for_period(period: str) -> str:
    # sanitize if periods contain spaces etc.; adjust if needed
    return f"{GRAPH_NAME}__{period}"

# --- 1) GDS PROJECTION (UNDIRECTED, WEIGHTED) ---
def ensure_graph_projection():
    # Drop if exists
    run("CALL gds.graph.drop($name, false) YIELD graphName", {"name": GRAPH_NAME})

    cypher = """
    CALL gds.graph.project.cypher(
      $name,
      // Nodes
      'MATCH (n:Measure|DataType|RqType)
       RETURN id(n) AS id, labels(n) AS labels',
      // Relationships (all three types), undirected, with w and inv_w as top-level props
      'MATCH (a)-[r:CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE]-(b)
       RETURN id(a) AS source,
              id(b) AS target,
              type(r) AS type,
              coalesce(r.w, 1.0) AS w,
              CASE WHEN coalesce(r.w,0) > 0 THEN 1.0 / r.w ELSE 1e12 END AS inv_w,
              "UNDIRECTED" AS orientation'
    )
    YIELD graphName, nodeCount, relationshipCount
    """
    df = run_df(cypher, {"name": GRAPH_NAME})
    print(df)


def period_graph_projection(period=str):
    name = graph_name_for_period(period)
    # Drop if exists
    run("CALL gds.graph.drop($name, false) YIELD graphName", {"name": name})

    cypher = """
    CALL gds.graph.project.cypher(
      $name,
      // Nodes
      'MATCH (n:Measure|DataType|RqType)
       RETURN id(n) AS id, labels(n) AS labels',
      // Relationships (all three types), undirected, with w and inv_w as top-level props
      'MATCH (a)-[r:CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE]-(b)
       WHERE r.period = $period
       RETURN id(a) AS source,
              id(b) AS target,
              type(r) AS type,
              coalesce(r.w, 1.0) AS w,
              CASE WHEN coalesce(r.w,0) > 0 THEN 1.0 / r.w ELSE 1e12 END AS inv_w,
              "UNDIRECTED" AS orientation',
      { parameters: {period: $period} }
    )
    YIELD graphName, nodeCount, relationshipCount
    """
    df = run_df(cypher, {"name": name, "period": period})
    print(df)


# --- 2) METRICS (STREAM) ---
def top_degree(name: str):
    cypher = f"""
    CALL gds.degree.stream($name)
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """
    return run_df(cypher, {"name": name})

def top_strength(name: str):
    cypher = f"""
    CALL gds.degree.stream($name, {{relationshipWeightProperty:'w'}})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score AS strength
    ORDER BY strength DESC
    """
    return run_df(cypher, {"name": name})

def top_betweenness(name: str, weight_prop: str | None = None):
    opts = "{}" if weight_prop is None else f"{{relationshipWeightProperty:'{weight_prop}'}}"
    cypher = f"""
    CALL gds.betweenness.stream($name, {opts})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """
    return run_df(cypher, {"name": name})

def hops_count(name: str):
    cypher = f"""
    CALL gds.allShortestPaths.stream($name)
    YIELD sourceNodeId, targetNodeId, distance
    RETURN gds.util.asNode(sourceNodeId).name AS source,
        gds.util.asNode(targetNodeId).name AS target,
        distance AS hops
    ORDER BY hops DESC
    """
    return run_df(cypher, {"name": name})

# def louvain():
#     cypher = f"""
#     CALL gds.louvain.stream($name, {{relationshipWeightProperty:'w'}})
#     YIELD nodeId, communityId
#     WITH communityId, gds.util.asNode(nodeId) AS n
#     RETURN communityId,
#            collect(n.name)[0..10] AS sample_members,
#            count(*) AS size
#     ORDER BY size DESC, communityId ASC
#     """
#     return run_df(cypher, {"name": GRAPH_NAME})



In [20]:
if __name__ == "__main__":
    ensure_graph_projection()
    top_degree_df = top_degree(name=GRAPH_NAME)
    top_strength_df = top_strength(name=GRAPH_NAME)
    top_betweenness_df = top_betweenness(name=GRAPH_NAME, weight_prop='inv_w')
    hops_count_df = hops_count(name=GRAPH_NAME)
    # louvain_df = louvain()
    top_betweenness_noweight_df = top_betweenness(name=GRAPH_NAME, weight_prop=None)

    # Triplets to DataFrame / CSV
    # trips = get_triplets(limit=50)
    # print("\nSample triplets:")
    # print(trips.head(10))
    # export_triplets_csv("triplets.csv")

    driver.close()



  graphName  nodeCount  relationshipCount
0  features         30                966


In [21]:
top_betweenness_df.to_csv(f"{path}/results/feature-only-KG/top_betweenness_{today}.csv")
top_degree_df.to_csv(f"{path}/results/feature-only-KG/top_degree_{today}.csv")
top_strength_df.to_csv(f"{path}/results/feature-only-KG/top_strength_{today}.csv")
hops_count_df.to_csv(f"{path}/results/feature-only-KG/hops_count_{today}.csv")
# louvain_df.to_csv(f"{path}/results/feature-only-KG/louvain.csv")
top_betweenness_noweight_df.to_csv(f"{path}/results/feature-only-KG/top_betweenness_noweight_{today}.csv")

In [5]:
from neo4j import GraphDatabase
import networkx as nx
import pandas as pd
from collections import defaultdict

# --- CONFIG ---
NEO4J_URI  = "bolt://localhost:7690"
NEO4J_AUTH = ("neo4j", "your_password")
REL_TYPES  = "CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE"  # undirected semantics

driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

def run_df(cypher, params=None):
    with driver.session() as s:
        data = s.run(cypher, params or {}).data()
    return pd.DataFrame(data)

# --- 0) Pull nodes & edges from Neo4j ---
# nodes: name + first label as "kind"
nodes_df = run_df("""
MATCH (n:Measure) RETURN n.name AS name, 'Measure' AS kind
UNION ALL
MATCH (n:DataType) RETURN n.name AS name, 'DataType' AS kind
UNION ALL
MATCH (n:RqType) RETURN n.name AS name, 'RqType' AS kind
""")

edges_df = run_df(f"""
MATCH (a)-[r:{REL_TYPES}]-(b)
RETURN a.name AS u, labels(a)[0] AS kind_u,
       b.name AS v, labels(b)[0] AS kind_v,
       coalesce(r.w,1) AS w, id(r) AS rel_id, type(r) AS rel_type
""")

# --- 1) Build undirected weighted graph in NetworkX ---
G = nx.Graph()
for _, row in nodes_df.iterrows():
    G.add_node(row["name"], kind=row["kind"])

for _, row in edges_df.iterrows():
    u, v = row["u"], row["v"]
    w    = float(row["w"])
    # accumulate weight if duplicates
    if G.has_edge(u, v):
        G[u][v]["weight"] += w
        G[u][v]["rel_ids"].append(row["rel_id"])
        G[u][v]["rel_types"].add(row["rel_type"])
    else:
        G.add_edge(u, v,
                   weight=w,
                   rel_ids=[row["rel_id"]],
                   rel_types={row["rel_type"]})

# --- 2a) Edge Betweenness Centrality (unweighted hops) ---
# Treat all edges equally for "fewest hops" bridges
edge_betweenness = nx.edge_betweenness_centrality(G, normalized=True)  # dict[(u,v)] -> score
edge_betweenness_df = pd.DataFrame(
    [(u, v, score) for (u, v), score in edge_betweenness.items()],
    columns=["u", "v", "edge_betweenness"]
)

# --- 2b) Edge Betweenness Centrality (weighted, using inverted weights) ---
# Assign inverted weight as 'length' attribute for shortest paths
for u, v, data in G.edges(data=True):
    data["length"] = 1.0 / data["weight"]

edge_betweenness_w = nx.edge_betweenness_centrality(G, normalized=True, weight="length")
edge_betweenness_w_df = pd.DataFrame(
    [(u, v, score) for (u, v), score in edge_betweenness_w.items()],
    columns=["u", "v", "edge_betweenness_weighted"]
)

# --- 3) Jaccard similarity (overlap of neighborhoods) for existing edges ---
def jaccard_for_edge(G, u, v):
    Nu = set(G.neighbors(u))
    Nv = set(G.neighbors(v))
    inter = Nu & Nv
    union = Nu | Nv
    return (len(inter) / len(union)) if union else 0.0

edge_jaccard = {(u, v): jaccard_for_edge(G, u, v) for u, v in G.edges()}
edge_jaccard_df = pd.DataFrame(
    [(u, v, score) for (u, v), score in edge_jaccard.items()],
    columns=["u", "v", "edge_jaccard"]
)




In [None]:
# edge_betweenness_df.to_csv(f"{path}/results/feature-only-KG/edge_betweenness.csv")
# edge_jaccard_df.to_csv(f"{path}/results/feature-only-KG/edge_jaccard.csv")
# edge_betweenness_w_df.to_csv(f"{path}/results/feature-only-KG/edge_betweenness_weighted.csv")

In [36]:
import networkx as nx

def edge_betweenness_per_period(period: str) -> pd.DataFrame:
    # Pull edges for the given period
    edges_df = run_df(f"""
    MATCH (a)-[r:{'|'.join(REL_TYPES)}]-(b)
    WHERE r.period = $period
    RETURN a.name AS u, labels(a)[0] AS kind_u,
           b.name AS v, labels(b)[0] AS kind_v,
           coalesce(r.w,1) AS w, id(r) AS rel_id, type(r) AS rel_type
    """, { "period": period })

    # Pull nodes (all types)
    nodes_df = run_df("""
    MATCH (n:Measure) RETURN n.name AS name, 'Measure' AS kind
    UNION ALL
    MATCH (n:DataType) RETURN n.name AS name, 'DataType' AS kind
    UNION ALL
    MATCH (n:RqType) RETURN n.name AS name, 'RqType' AS kind
    """)

    # Build graph
    G = nx.Graph()
    for _, row in nodes_df.iterrows():
        G.add_node(row["name"], kind=row["kind"])
    for _, row in edges_df.iterrows():
        u, v = row["u"], row["v"]
        w    = float(row["w"])
        if G.has_edge(u, v):
            G[u][v]["weight"] += w
            G[u][v]["rel_ids"].append(row["rel_id"])
            G[u][v]["rel_types"].add(row["rel_type"])
        else:
            G.add_edge(u, v,
                       weight=w,
                       rel_ids=[row["rel_id"]],
                       rel_types={row["rel_type"]})

    # Unweighted betweenness
    edge_betweenness = nx.edge_betweenness_centrality(G, normalized=True)
    edge_betweenness_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness.items()],
        columns=["u", "v", "edge_betweenness"]
    )

    # Weighted betweenness (inverse weight as length)
    for u, v, data in G.edges(data=True):
        data["length"] = 1.0 / data["weight"]
    edge_betweenness_w = nx.edge_betweenness_centrality(G, normalized=True, weight="length")
    edge_betweenness_w_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness_w.items()],
        columns=["u", "v", "edge_betweenness_weighted"]
    )

    # Merge and return
    return edge_betweenness_df.merge(edge_betweenness_w_df, on=["u", "v"], how="outer")


In [37]:
for p in df['period'].cat.categories:   # your 6 categorical periods
    period_df = edge_betweenness_per_period(p)
    os.makedirs(f"{path}/results/feature-only-KG/periods/edge_betweenness", exist_ok=True)
    period_df.to_csv(f"{path}/results/feature-only-KG/periods/edge_betweenness/{p}_{today}.csv")




# 2. Temporal

In [22]:
df['period'].cat.categories

Index(['-2000', '2001-2005', '2006-2010', '2011-2015', '2016-2020',
       '2021-2025'],
      dtype='object')

In [27]:
for p in df['period'].cat.categories:
    print(f"\n--- Period: {p} ---")
    period_graph_projection(period=p)
    gname = graph_name_for_period(p)
    top_degree_df = top_degree(name=gname)
    top_strength_df = top_strength(name=gname)
    top_betweenness_df = top_betweenness(name=gname, weight_prop='inv_w')
    hops_count_df = hops_count(name=gname)
    top_betweenness_noweight_df = top_betweenness(name=gname, weight_prop=None)

    os.makedirs(f"{path}/results/feature-only-KG/periods", exist_ok=True)
    os.makedirs(f"{path}/results/feature-only-KG/periods/top_betweenness", exist_ok=True)
    os.makedirs(f"{path}/results/feature-only-KG/periods/top_degree", exist_ok=True)
    os.makedirs(f"{path}/results/feature-only-KG/periods/top_strength", exist_ok=True)
    os.makedirs(f"{path}/results/feature-only-KG/periods/hops_count", exist_ok=True)
    os.makedirs(f"{path}/results/feature-only-KG/periods/top_betweenness_noweight", exist_ok=True)

    top_betweenness_df.to_csv(f"{path}/results/feature-only-KG/periods/top_betweenness/{p}_{today}.csv")
    top_degree_df.to_csv(f"{path}/results/feature-only-KG/periods/top_degree/{p}_{today}.csv")
    top_strength_df.to_csv(f"{path}/results/feature-only-KG/periods/top_strength/{p}_{today}.csv")
    hops_count_df.to_csv(f"{path}/results/feature-only-KG/periods/hops_count/{p}_{today}.csv")
    top_betweenness_noweight_df.to_csv(f"{path}/results/feature-only-KG/periods/top_betweenness_noweight/{p}_{today}.csv")




--- Period: -2000 ---
         graphName  nodeCount  relationshipCount
0  features__-2000         30                 64

--- Period: 2001-2005 ---




             graphName  nodeCount  relationshipCount
0  features__2001-2005         30                 82

--- Period: 2006-2010 ---
             graphName  nodeCount  relationshipCount
0  features__2006-2010         30                128





--- Period: 2011-2015 ---
             graphName  nodeCount  relationshipCount
0  features__2011-2015         30                206

--- Period: 2016-2020 ---
             graphName  nodeCount  relationshipCount
0  features__2016-2020         30                240





--- Period: 2021-2025 ---
             graphName  nodeCount  relationshipCount
0  features__2021-2025         30                246


## Degree/2*triplets

In [126]:
# ‚ö†Ô∏è MAKE SURE that df is the one under 0.Libraries
# Run 2.Centrality Measures

cypher = """
// For Measures
MATCH (m:Measure)-[:CO_MEASURE_DATATYPE {period:$period}]-(d:DataType)
      -[:CO_DATATYPE_RQTYPE {period:$period}]-(r:RqType)
      -[:CO_RQTYPE_MEASURE {period:$period}]-(m)
WITH m, count(DISTINCT [m,d,r]) AS triplets
MATCH (m)-[rel {period:$period}]-(n)
WITH m.name AS node, 'Measure' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets, 
       toFloat(degree) / (2.0 * triplets) AS normalized_degree

UNION ALL
// For DataTypes
MATCH (d:DataType)-[:CO_DATATYPE_RQTYPE {period:$period}]-(r:RqType)
      -[:CO_RQTYPE_MEASURE {period:$period}]-(m:Measure)
      -[:CO_MEASURE_DATATYPE {period:$period}]-(d)
WITH d, count(DISTINCT [m,d,r]) AS triplets
MATCH (d)-[rel {period:$period}]-(n)
WITH d.name AS node, 'DataType' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets, 
       toFloat(degree) / (2.0 * triplets) AS normalized_degree

UNION ALL
// For RqTypes
MATCH (r:RqType)-[:CO_RQTYPE_MEASURE {period:$period}]-(m:Measure)
      -[:CO_MEASURE_DATATYPE {period:$period}]-(d:DataType)
      -[:CO_DATATYPE_RQTYPE {period:$period}]-(r)
WITH r, count(DISTINCT [m,d,r]) AS triplets
MATCH (r)-[rel {period:$period}]-(n)
WITH r.name AS node, 'RqType' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets, 
       toFloat(degree) / (2.0 * triplets) AS normalized_degree
"""



for p in df['period'].cat.categories:
    print(f"\n--- Period: {p} ---")
    write_table = pd.DataFrame()
    period = p
    df = run_df(cypher, {"period": period})
    write_table = pd.concat([write_table, df], ignore_index=True)
    os.makedirs(f"{path}/results/feature-only-KG/periods/degree_normalized", exist_ok=True)
    write_table.to_csv(f"{path}/results/feature-only-KG/periods/degree_normalized/{p}_{today}.csv", index=False)




--- Period: -2000 ---

--- Period: 2001-2005 ---

--- Period: 2006-2010 ---

--- Period: 2011-2015 ---

--- Period: 2016-2020 ---

--- Period: 2021-2025 ---


# 4. Visualization

## Top Degree & Top Strength

In [25]:
top_degree_all = {}
for file in os.listdir(f"{path}/results/feature-only-KG/periods/top_degree"):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        df = pd.read_csv(f"{path}/results/feature-only-KG/periods/top_degree/{file}")
        df['period'] = period
        top_degree_all[period] = df
all_top_degree_df = pd.concat(top_degree_all.values(), ignore_index=True)


top_strength_all = {}
for file in os.listdir(f"{path}/results/feature-only-KG/periods/top_strength"):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        df = pd.read_csv(f"{path}/results/feature-only-KG/periods/top_strength/{file}")
        df['period'] = period
        top_strength_all[period] = df
all_top_strength_df = pd.concat(top_strength_all.values(), ignore_index=True)

In [26]:
import numpy as np
import pandas as pd
import plotly.express as px
import re
import unicodedata
import plotly.graph_objects as go
import plotly.subplots as sp


template_type = "none"
showlegend = True

def norm(s):
    if pd.isna(s): return s
    s = unicodedata.normalize("NFKC", str(s))
    s = re.sub(r"[\u2010\u2011\u2012\u2013\u2014\u2212]", "-", s)  # dash variants ‚Üí "-"
    s = re.sub(r"\s+", " ", s).strip()
    return s

deg_df = all_top_degree_df.copy()
str_df = all_top_strength_df.copy()

if "strength" in str_df.columns:
    str_df = str_df.rename(columns={"strength": "score"})

deg_df["metric"] = "degree"
str_df["metric"] = "strength"

long_df = pd.concat(
    [deg_df[["kind","node","score","period","metric"]],
     str_df[["kind","node","score","period","metric"]]],
    ignore_index=True
)

long_df["node"] = long_df["node"].apply(norm)

# Period order
period_order = ["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"]
long_df["period"] = pd.Categorical(long_df["period"], categories=period_order, ordered=True)

# ---- choose which nodes to color; everything else will be grey ----
HIGHLIGHTS = {"Intergenerational Wealth Mobility and Inheritance", "Regression-based Measures", "Empirical Estimates and Determinants", "Panel/Longitudinal Surveys"}

# Build a color group: each highlighted node gets its own label; all others -> "Other"
long_df["color_group"] = np.where(long_df["node"].isin(HIGHLIGHTS), long_df["node"], "Other")

# Assign Dark2 colors to highlighted nodes; grey for "Other"
palette = {name: px.colors.qualitative.Dark2[i % len(px.colors.qualitative.Dark2)] for i, name in enumerate(sorted(HIGHLIGHTS))}
palette["Other"] = "#C7C7C7"

# ---------------------------
# Helpers: size scaling
# ---------------------------
def scaled_size(series, min_size=6, max_size=28, transform=None):
    """Return a numpy array of marker sizes from a pandas Series."""
    x = series.astype(float).to_numpy()
    if transform is not None:
        x = transform(x)
    if x.size == 0:
        return x
    lo, hi = np.nanmin(x), np.nanmax(x)
    if not np.isfinite(lo) or not np.isfinite(hi) or hi - lo < 1e-12:
        return np.full_like(x, (min_size + max_size) / 2.0)
    return min_size + (x - lo) / (hi - lo) * (max_size - min_size)

# Make per-metric size columns (global scaling across periods for comparability)
deg_mask = long_df["metric"] == "degree"
str_mask = long_df["metric"] == "strength"

sizes_deg = pd.Series(index=long_df.index, dtype=float)
sizes_str = pd.Series(index=long_df.index, dtype=float)

sizes_deg.loc[deg_mask] = scaled_size(long_df.loc[str_mask, "score"], min_size=6, max_size=28, transform=None)
# strength: use sqrt to compress heavy tail
sizes_str.loc[str_mask] = scaled_size(long_df.loc[deg_mask, "score"], min_size=6, max_size=28, transform=None)

long_df["size_deg"] = sizes_deg
long_df["size_str"] = sizes_str

# ------------------------------
# A) Beeswarm / strip: Degree over periods (size by degree)
# ------------------------------
df_deg = long_df[deg_mask].copy()

fig_degree_strip = px.scatter(
    df_deg,
    x="period", y="score",
    color="color_group",
    size = "size_deg",
    hover_data=["node","kind","score"],
    category_orders={"period": period_order},
    color_discrete_map=palette,
    title="Degree distribution per period (size ‚àù Strength)" # intentional: size by strength for degree plot
)
fig_degree_strip.update_layout(
    template=template_type, 
    showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)", 
    # paper_bgcolor = "rgba(0,0,0,0)",
    )

# ------------------------------
# B) Beeswarm / strip: Strength over periods 
# ------------------------------
df_str = long_df[str_mask].copy()

fig_strength_strip = px.scatter(
    df_str,
    x="period", y="score",
    color="color_group",
    size = "size_str",
    hover_data=["node","kind","score"],
    category_orders={"period": period_order},
    color_discrete_map=palette,
    title="Strength distribution per period (size ‚àù Degree)", # intentional: size by degree for strength plot
    # log_y=True,
)

fig_strength_strip.update_layout(
    yaxis_title="Strength (Œ£ weights)",
    legend_title_text="Node (highlighted only)"
)

fig_strength_strip.update_layout(
    template=template_type, 
    showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)", 
    # paper_bgcolor = "rgba(0,0,0,0)"
    )

# ------------------------------
# C) Strength vs Degree scatter, faceted by period (optional)
# ------------------------------
deg_p = long_df[deg_mask][["period","node","kind","score","color_group"]].rename(columns={"score":"degree"})
str_p = long_df[str_mask][["period","node","score"]].rename(columns={"score":"strength"})
sd = deg_p.merge(str_p, on=["period","node"], how="inner")

sd['log_y+1']= np.log10(sd["strength"] + 1)
sd = sd.sort_values(by=["period","degree"], ascending=[True,False])

fig_sd = sp.make_subplots(rows=2, cols=3, subplot_titles=period_order)

for idx, period in enumerate(period_order):
    if period not in sd['period'].values:
        continue
        
    period_data = sd[sd['period'] == period]
    
    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 1
    
    # Add scatter points
    fig_sd.add_trace(go.Scatter(
        x=period_data['degree'], 
        y=period_data['log_y+1'],
        mode='markers',
        marker=dict(
            size=8,
            color=[palette.get(cg, "#C7C7C7") for cg in period_data['color_group']]
        ),
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Degree: %{x}<br>" +
            "Strength: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['strength']), axis=-1),
        showlegend=False
    ), row=row, col=col)  # Added row and col here
    
    # Calculate OLS fit
    z = np.polyfit(period_data['degree'], period_data['log_y+1'], 1)
    p = np.poly1d(z)
    x_trend = np.linspace(period_data['degree'].min(), period_data['degree'].max(), 100)
    y_trend = p(x_trend)
    
    # Add trendline
    fig_sd.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=2),
            showlegend=False,
            hoverinfo='skip',
        ),
        row=row, col=col
    )

fig_sd.update_layout(
    template=template_type, 
    showlegend=showlegend,
    title_text="Strength vs Degree by period",
    
)

fig_sd.update_xaxes(range=[0, 20])
fig_sd.update_xaxes(title_text="degree", row=2)
fig_sd.update_yaxes(title_text="log_y+1")

fig_sd.update_layout(template=template_type, showlegend=showlegend, 
                    #  plot_bgcolor = "rgba(0,0,0,0)", 
                    #  paper_bgcolor = "rgba(0,0,0,0)"
                     )

fig_degree_strip.show()
fig_strength_strip.show()
fig_sd.show()
# fig_sd_simple.show()

# fig_degree_strip.write_html(f"{path}/results/feature-only-KG/img/degree_over_time.html")
# fig_strength_strip.write_html(f"{path}/results/feature-only-KG/img/strength_over_time.html")
# fig_sd.write_html(f"{path}/results/feature-only-KG/img/strength_vs_degree.html")

# fig_degree_strip.write_image(f"{path}/results/feature-only-KG/img/degree_over_time_legend.png", width=800, height=600, scale=2)
# fig_strength_strip.write_image(f"{path}/results/feature-only-KG/img/strength_over_time_legend.png", width=800, height=600, scale=2)
# fig_sd.write_image(f"{path}/results/feature-only-KG/img/strength_vs_degree_legend.png", width=1200, height=800, scale=2)


## Node Betweenness & Edge Betweenness

In [27]:
top_betweenness = {}
for file in os.listdir(f"{path}/results/feature-only-KG/periods/top_betweenness"):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        df = pd.read_csv(f"{path}/results/feature-only-KG/periods/top_betweenness/{file}")
        df['period'] = period
        top_betweenness[period] = df
all_top_betweenness_df = pd.concat(top_betweenness.values(), ignore_index=True)


edge_betweenness = {}
for file in os.listdir(f"{path}/results/feature-only-KG/periods/edge_betweenness"):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        df = pd.read_csv(f"{path}/results/feature-only-KG/periods/edge_betweenness/{file}")
        df['period'] = period
        edge_betweenness[period] = df
all_edge_betweenness_df = pd.concat(edge_betweenness.values(), ignore_index=True)

In [28]:
showlegend = True

all_top_betweenness_df["node"] = all_top_betweenness_df["node"].apply(norm)
all_top_betweenness_df["color_group"] = np.where(all_top_betweenness_df["node"].isin(HIGHLIGHTS), all_top_betweenness_df["node"], "Other")

all_edge_betweenness_df["u"] = all_edge_betweenness_df["u"].apply(norm)
all_edge_betweenness_df["v"] = all_edge_betweenness_df["v"].apply(norm)
v_highlights = ["Intergenerational Wealth Mobility and Inheritance", "Empirical Estimates and Determinants", "No dataset", "National Survey Data", "Panel/Longitudinal Surveys"]
all_edge_betweenness_df["color_group"] = np.where(all_edge_betweenness_df["u"].isin(["Regression-based Measures", "No dataset"]) & all_edge_betweenness_df["v"].isin(v_highlights), all_edge_betweenness_df["v"], "Other")
edge_palette = {name: px.colors.qualitative.Dark2[i % len(px.colors.qualitative.Dark2)] for i, name in enumerate(sorted(set(all_edge_betweenness_df["color_group"]) - {"Other"}))}
edge_palette["Other"] = "#C7C7C7"

all_top_betweenness_df["rank"] = all_top_betweenness_df.groupby("period")["score"].rank(ascending=False, method="min")
all_edge_betweenness_df["rank"] = all_edge_betweenness_df.groupby("period")["edge_betweenness_weighted"].rank(ascending=False, method="min")
all_top_betweenness_df["prev_rank"] = all_top_betweenness_df.groupby("period")["rank"].shift(1)
all_top_betweenness_df['prev_rank'] = all_top_betweenness_df['prev_rank'].fillna(1)
all_edge_betweenness_df["prev_rank"] = all_edge_betweenness_df.groupby("period")["rank"].shift(1)
all_edge_betweenness_df['prev_rank'] = all_edge_betweenness_df['prev_rank'].fillna(1)

max_size = 30
min_size = 6

def linear_size(rank, max_rank, min_size, max_size):
    if max_rank == 1:
        return max_size
    return max_size - (rank - 1) * (max_size - min_size) / (max_rank - 1)

# Compute size per period
all_top_betweenness_df["size"] = all_top_betweenness_df.groupby("period").apply(
    lambda g: g["prev_rank"].apply(lambda r: linear_size(r, g["prev_rank"].max(), min_size, max_size))
).reset_index(level=0, drop=True)

all_edge_betweenness_df["size"] = all_edge_betweenness_df.groupby("period").apply(
    lambda g: g["prev_rank"].apply(lambda r: linear_size(r, g["prev_rank"].max(), min_size, max_size))
).reset_index(level=0, drop=True)
# ------------------------------

fig_btw = px.scatter(
    all_top_betweenness_df,
    x="period", y="score",
    size = "size", # ‚ö†Ô∏è Change the size to the previous ranks?
    color = "color_group",
    hover_data=["node","kind","score"],
    category_orders={"period": period_order},
    title="Betweenness distribution per period (size ‚àù betweenness)",
    color_discrete_map=palette
)
fig_btw.update_layout(
    template=template_type, showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
    )

fig_ebtw = px.scatter(
    all_edge_betweenness_df,
    x="period", y="edge_betweenness_weighted",
    color="color_group",
    size = "size",
    hover_data=["u","v","edge_betweenness_weighted"],
    category_orders={"period": period_order},
    title="Weighted Edge Betweenness distribution per period (size ‚àù weighted edge betweenness)",
    color_discrete_map=edge_palette
)
fig_ebtw.update_layout(
    template=template_type, showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
    )



fig_btw.show()
fig_ebtw.show()

# fig_btw.write_html(f"{path}/results/feature-only-KG/img/betweenness_over_time.html")
# fig_ebtw.write_html(f"{path}/results/feature-only-KG/img/edge_betweenness_over_time.html")
# fig_btw.write_image(f"{path}/results/feature-only-KG/img/betweenness_over_time_legend.png", width=800, height=600, scale=2)
# fig_ebtw.write_image(f"{path}/results/feature-only-KG/img/edge_betweenness_over_time_legend.png", width=800, height=600, scale=2)

    







### Cohen's kappa for node/edge betweenness 

In [10]:
from sklearn.metrics import cohen_kappa_score

period_order = ["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"]

def compute_node_kappa(all_top_betweenness_df, period_order):
    data = all_top_betweenness_df.copy()

    kappa_node_df = []
    for i, p in enumerate(period_order):
        if p == period_order[-1]:
            break
        period_A = data[data['period'] == p].copy()
        period_B = data[data['period'] == period_order[i+1]].copy()
        rater1 = period_A['rank'].to_list()
        rater2 = period_B['rank'].to_list()
        cohen_kappa = cohen_kappa_score(rater1, rater2)
        print(f"Cohen's Kappa between {p} and {period_order[i+1]}: {cohen_kappa:.4f}")
        kappa_node_df.append({
            "period_A": p,
            "period_B": period_order[i+1],
            "cohen_kappa": cohen_kappa
        })
    kappa_node_df = pd.DataFrame(kappa_node_df)
    return kappa_node_df

kappa_node_df = compute_node_kappa(all_top_betweenness_df, period_order)
kappa_node_df

Cohen's Kappa between -2000 and 2001-2005: 0.1379
Cohen's Kappa between 2001-2005 and 2006-2010: 0.1724
Cohen's Kappa between 2006-2010 and 2011-2015: 0.2571
Cohen's Kappa between 2011-2015 and 2016-2020: 0.2584
Cohen's Kappa between 2016-2020 and 2021-2025: 0.2414


Unnamed: 0,period_A,period_B,cohen_kappa
0,-2000,2001-2005,0.137931
1,2001-2005,2006-2010,0.172414
2,2006-2010,2011-2015,0.257075
3,2011-2015,2016-2020,0.258427
4,2016-2020,2021-2025,0.241379


In [173]:
all_edge_betweenness_df.groupby("period").size()

period
-2000         32
2001-2005     41
2006-2010     64
2011-2015    103
2016-2020    120
2021-2025    123
dtype: int64

In [11]:
from sklearn.metrics import cohen_kappa_score

def compute_edge_kappa(all_edge_betweenness_df, period_order):
    data = all_edge_betweenness_df.copy()
    align_cols = ["u", "v"]
            
    kappa_edge_df = []
    for i, p in enumerate(period_order):
        if p == period_order[-1]:
            break
        period_A = data[data['period'] == p].copy()
        period_B = data[data['period'] == period_order[i+1]].copy()
        # Align by node or edge
        merged = pd.merge(period_A, period_B, on=align_cols, suffixes=('_A', '_B'))
        rater1 = merged['rank_A'].to_list()
        rater2 = merged['rank_B'].to_list()
        if len(rater1) == 0 or len(rater2) == 0:
            cohen_kappa = np.nan
        else:
            cohen_kappa = cohen_kappa_score(rater1, rater2)
        print(f"Cohen's Kappa between {p} and {period_order[i+1]}: {cohen_kappa:.4f}")
        kappa_edge_df.append({
            "period_A": p,
            "period_B": period_order[i+1],
            "cohen_kappa": cohen_kappa
        })
    kappa_edge_df = pd.DataFrame(kappa_edge_df)
    return kappa_edge_df

kappa_edge_df = compute_edge_kappa(all_edge_betweenness_df, period_order)
kappa_edge_df

Cohen's Kappa between -2000 and 2001-2005: 0.0175
Cohen's Kappa between 2001-2005 and 2006-2010: -0.0116
Cohen's Kappa between 2006-2010 and 2011-2015: 0.0126
Cohen's Kappa between 2011-2015 and 2016-2020: 0.0265
Cohen's Kappa between 2016-2020 and 2021-2025: 0.0179


Unnamed: 0,period_A,period_B,cohen_kappa
0,-2000,2001-2005,0.017476
1,2001-2005,2006-2010,-0.011561
2,2006-2010,2011-2015,0.012633
3,2011-2015,2016-2020,0.026549
4,2016-2020,2021-2025,0.017872


In [12]:
showlegend = False

all_top_betweenness_df["node"] = all_top_betweenness_df["node"].apply(norm)
all_top_betweenness_df["color_group"] = np.where(all_top_betweenness_df["node"].isin(HIGHLIGHTS), all_top_betweenness_df["node"], "Other")

all_edge_betweenness_df["u"] = all_edge_betweenness_df["u"].apply(norm)
all_edge_betweenness_df["v"] = all_edge_betweenness_df["v"].apply(norm)
v_highlights = ["Intergenerational Wealth Mobility and Inheritance", "Empirical Estimates and Determinants", "No dataset", "National Survey Data", "Panel/Longitudinal Surveys"]
all_edge_betweenness_df["color_group"] = np.where(all_edge_betweenness_df["u"].isin(["Regression-based Measures", "No dataset"]) & all_edge_betweenness_df["v"].isin(v_highlights), all_edge_betweenness_df["v"], "Other")
edge_palette = {name: px.colors.qualitative.Dark2[i % len(px.colors.qualitative.Dark2)] for i, name in enumerate(sorted(set(all_edge_betweenness_df["color_group"]) - {"Other"}))}
edge_palette["Other"] = "#C7C7C7"


# Compute size per period
all_top_betweenness_df["size"] = all_top_betweenness_df.groupby("period").apply(
    lambda g: g["prev_rank"].apply(lambda r: linear_size(r, g["prev_rank"].max(), min_size, max_size))
).reset_index(level=0, drop=True)

all_edge_betweenness_df["size"] = all_edge_betweenness_df.groupby("period").apply(
    lambda g: g["prev_rank"].apply(lambda r: linear_size(r, g["prev_rank"].max(), min_size, max_size))
).reset_index(level=0, drop=True)
# ------------------------------

fig_btw = px.scatter(
    all_top_betweenness_df,
    x="period", y="score",
    color = "color_group",
    hover_data=["node","kind","score"],
    category_orders={"period": period_order},
    title="Betweenness distribution per period (size ‚àù betweenness)",
    color_discrete_map=palette
)

# Add Cohen's kappa line with secondary y-axis
fig_btw.add_trace(
    go.Bar(
        x=kappa_node_df['period_B'],
        y=kappa_node_df['cohen_kappa'],
        name="Cohen's kappa",
        marker=dict(color="rgba(230,171,2,0.3)"),
        # marker=dict(symbol="circle", size=10, color="black"),
        yaxis="y2",
        hovertemplate="Period: %{x}<br>Kappa: %{y:.3f}<extra></extra>"
    )
)

# Update layout for secondary y-axis
fig_btw.update_layout(
    yaxis2=dict(
        title="Cohen's kappa",
        overlaying="y",
        side="right",
        range=[0, 1],
        showgrid=False
    ),
    template=template_type,
    showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
)
fig_btw.update_layout(
    template=template_type, showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
    )

fig_ebtw = px.scatter(
    all_edge_betweenness_df,
    x="period", y="edge_betweenness_weighted",
    color="color_group",
    hover_data=["u","v","edge_betweenness_weighted"],
    category_orders={"period": period_order},
    title="Weighted Edge Betweenness distribution per period (size ‚àù weighted edge betweenness)",
    color_discrete_map=edge_palette
)
fig_ebtw.update_layout(
    template=template_type, showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
    )



fig_btw.show()
fig_ebtw.show()

# fig_btw.write_html(f"{path}/results/feature-only-KG/img/betweenness_over_time.html")
# fig_ebtw.write_html(f"{path}/results/feature-only-KG/img/edge_betweenness_over_time.html")
# fig_btw.write_image(f"{path}/results/feature-only-KG/img/betweenness_over_time_legend.png", width=800, height=600, scale=2)
# fig_ebtw.write_image(f"{path}/results/feature-only-KG/img/edge_betweenness_over_time_legend.png", width=800, height=600, scale=2)

    







### Let's do this with complete and non-zero data

In [13]:
edge_btw = all_edge_betweenness_df[all_edge_betweenness_df['edge_betweenness_weighted'] != 0]
complete_pair_names = edge_btw.groupby(['u','v']).size().reset_index(name='count')
consistent_pairs = complete_pair_names[complete_pair_names['count'] == len(period_order)]
edge_btw = edge_btw[edge_btw.set_index(['u','v']).index.isin(consistent_pairs.set_index(['u','v']).index)].copy()
edge_btw

Unnamed: 0.1,Unnamed: 0,u,v,edge_betweenness,edge_betweenness_weighted,period,color_group,rank,prev_rank,size
27,27,Decomposition / Structural Approaches,No dataset,0.007405,0.002299,2016-2020,Other,39.0,47.0,6.0
54,54,No dataset,Theoretical and Structural Models,0.009495,0.057471,2016-2020,Other,12.0,47.0,6.0
96,96,Regression-based Measures,Empirical Estimates and Determinants,0.008378,0.181609,2016-2020,Empirical Estimates and Determinants,1.0,16.0,22.173913
97,97,Regression-based Measures,Intergenerational Wealth Mobility and Inheritance,0.009752,0.170115,2016-2020,Intergenerational Wealth Mobility and Inheritance,2.0,1.0,30.0
99,99,Regression-based Measures,Linked Administrative Data,0.007904,0.045977,2016-2020,Other,25.0,23.0,18.521739
104,104,Regression-based Measures,No dataset,0.008282,0.096552,2016-2020,No dataset,4.0,47.0,6.0
106,106,Regression-based Measures,Panel/Longitudinal Surveys,0.007656,0.087356,2016-2020,Panel/Longitudinal Surveys,6.0,47.0,6.0
143,23,Decomposition / Structural Approaches,No dataset,0.008119,0.029885,2021-2025,Other,25.0,54.0,6.0
172,52,No dataset,Theoretical and Structural Models,0.013139,0.05977,2021-2025,Other,11.0,54.0,6.0
218,98,Regression-based Measures,Empirical Estimates and Determinants,0.008505,0.183908,2021-2025,Empirical Estimates and Determinants,2.0,54.0,6.0


In [14]:
complete_pair_names['count'].value_counts()

count
1    48
2    21
3    14
4    12
6     7
5     6
Name: count, dtype: int64

In [15]:
kappa_edge_consistent_df = compute_edge_kappa(edge_btw, period_order)
kappa_edge_consistent_df

Cohen's Kappa between -2000 and 2001-2005: -0.0652
Cohen's Kappa between 2001-2005 and 2006-2010: -0.0208
Cohen's Kappa between 2006-2010 and 2011-2015: 0.1064
Cohen's Kappa between 2011-2015 and 2016-2020: 0.0667
Cohen's Kappa between 2016-2020 and 2021-2025: 0.0667


Unnamed: 0,period_A,period_B,cohen_kappa
0,-2000,2001-2005,-0.065217
1,2001-2005,2006-2010,-0.020833
2,2006-2010,2011-2015,0.106383
3,2011-2015,2016-2020,0.066667
4,2016-2020,2021-2025,0.066667


In [16]:
node_btw = all_top_betweenness_df[all_top_betweenness_df['score'] != 0]
complete_node_names = node_btw.groupby(['node']).size().reset_index(name='count')
consistent_nodes = complete_node_names[complete_node_names['count'] == len(period_order)]
node_btw = node_btw[node_btw['node'].isin(consistent_nodes['node'])].copy()
node_btw 

Unnamed: 0.1,Unnamed: 0,kind,node,score,period,color_group,rank,prev_rank,size
0,0,Measure,Regression-based Measures,462.0,2016-2020,Regression-based Measures,1.0,1.0,30.0
1,1,RqType,Intergenerational Wealth Mobility and Inheritance,176.0,2016-2020,Intergenerational Wealth Mobility and Inheritance,2.0,1.0,30.0
5,5,DataType,No dataset,52.0,2016-2020,Other,6.0,5.0,19.333333
30,0,Measure,Regression-based Measures,492.0,2021-2025,Regression-based Measures,1.0,1.0,30.0
31,1,RqType,Intergenerational Wealth Mobility and Inheritance,208.0,2021-2025,Intergenerational Wealth Mobility and Inheritance,2.0,1.0,30.0
33,3,DataType,No dataset,120.0,2021-2025,Other,4.0,3.0,23.142857
60,0,RqType,Intergenerational Wealth Mobility and Inheritance,117.0,2001-2005,Intergenerational Wealth Mobility and Inheritance,1.0,1.0,30.0
61,1,DataType,No dataset,88.666667,2001-2005,Other,2.0,1.0,30.0
63,3,Measure,Regression-based Measures,53.666667,2001-2005,Regression-based Measures,4.0,3.0,20.4
90,0,RqType,Intergenerational Wealth Mobility and Inheritance,119.166667,-2000,Intergenerational Wealth Mobility and Inheritance,1.0,1.0,30.0


In [17]:
complete_node_names['count'].value_counts()

count
1    11
6     3
5     2
3     1
2     1
Name: count, dtype: int64

In [18]:
kappa_node_consistent_df = compute_node_kappa(node_btw, period_order)
kappa_node_consistent_df

Cohen's Kappa between -2000 and 2001-2005: 0.5714
Cohen's Kappa between 2001-2005 and 2006-2010: 0.5714
Cohen's Kappa between 2006-2010 and 2011-2015: 0.5714
Cohen's Kappa between 2011-2015 and 2016-2020: 0.5714
Cohen's Kappa between 2016-2020 and 2021-2025: 0.5714


Unnamed: 0,period_A,period_B,cohen_kappa
0,-2000,2001-2005,0.571429
1,2001-2005,2006-2010,0.571429
2,2006-2010,2011-2015,0.571429
3,2011-2015,2016-2020,0.571429
4,2016-2020,2021-2025,0.571429


In [20]:
all_top_betweenness_df[all_top_betweenness_df['score'] == 0].tail()

Unnamed: 0.1,Unnamed: 0,kind,node,score,period,color_group,rank,prev_rank,size
175,25,DataType,Opportunity Atlas,0.0,2006-2010,Other,9.0,9.0,6.0
176,26,RqType,Perceptions of Mobility and Attitudes,0.0,2006-2010,Other,9.0,9.0,6.0
177,27,RqType,Others_RqType,0.0,2006-2010,Other,9.0,9.0,6.0
178,28,DataType,University/Institution Data,0.0,2006-2010,Other,9.0,9.0,6.0
179,29,DataType,Big Data,0.0,2006-2010,Other,9.0,9.0,6.0


### Compare the betweenness (both node- and edge-wise) by setting the non-existing nodes as 0 betweenness

#### Node-wise

In [8]:
for p in all_top_betweenness_df['period'].unique():
    period_df = all_top_betweenness_df[all_top_betweenness_df['period'] == p]
    print(f"Period: {p}, Zero betweenness count: {(period_df['score'] == 0).sum()}, Total count: {len(period_df)}, Total nodes: {period_df['node'].nunique()}")

Period: 2016-2020, Zero betweenness count: 21, Total count: 30, Total nodes: 30
Period: 2021-2025, Zero betweenness count: 23, Total count: 30, Total nodes: 30
Period: 2001-2005, Zero betweenness count: 25, Total count: 30, Total nodes: 30
Period: -2000, Zero betweenness count: 26, Total count: 30, Total nodes: 30
Period: 2011-2015, Zero betweenness count: 19, Total count: 30, Total nodes: 30
Period: 2006-2010, Zero betweenness count: 22, Total count: 30, Total nodes: 30


It looks like each period gets 30 nodes in total. And probably those that didn't appear are set to 0

In [29]:
from scipy.stats import kendalltau

period_order = ["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"]

def compute_kendall_tau(all_top_betweenness_df, period_order):
    data = all_top_betweenness_df.copy()

    kendall_tau_df = []
    for i, p in enumerate(period_order):
        if p == period_order[-1]:
            break
        period_A = data[data['period'] == p].copy()
        period_B = data[data['period'] == period_order[i+1]].copy()
        rater1 = period_A['rank'].to_list()
        rater2 = period_B['rank'].to_list()
        tau, p_value = kendalltau(rater1, rater2)
        print(f"Kendall's Tau between {p} and {period_order[i+1]}: {tau:.4f} (p-value: {p_value:.4f})")
        kendall_tau_df.append({
            "period_A": p,
            "period_B": period_order[i+1],
            "kendall_tau": tau,
            "p_value": p_value
        })
    kendall_tau_df = pd.DataFrame(kendall_tau_df)
    return kendall_tau_df


kendall_tau_df = compute_kendall_tau(all_top_betweenness_df, period_order)
kendall_tau_df

Kendall's Tau between -2000 and 2001-2005: 0.9027 (p-value: 0.0000)
Kendall's Tau between 2001-2005 and 2006-2010: 0.8135 (p-value: 0.0000)
Kendall's Tau between 2006-2010 and 2011-2015: 0.8781 (p-value: 0.0000)
Kendall's Tau between 2011-2015 and 2016-2020: 0.9185 (p-value: 0.0000)
Kendall's Tau between 2016-2020 and 2021-2025: 0.8994 (p-value: 0.0000)


Unnamed: 0,period_A,period_B,kendall_tau,p_value
0,-2000,2001-2005,0.902671,2.593841e-07
1,2001-2005,2006-2010,0.813489,1.600421e-06
2,2006-2010,2011-2015,0.878072,5.90287e-08
3,2011-2015,2016-2020,0.918466,1.079822e-08
4,2016-2020,2021-2025,0.899383,5.397147e-08


In [34]:
for i, p in enumerate(period_order):
    print(i, p)

0 -2000
1 2001-2005
2 2006-2010
3 2011-2015
4 2016-2020
5 2021-2025


In [None]:
sub_all_top_betweenness_df = all_top_betweenness_df[all_top_betweenness_df['score'] != 0]

subset_kendall_tau = []
for i in range(len(period_order) - 1):
    p1 = sub_all_top_betweenness_df[sub_all_top_betweenness_df['period'] == period_order[i]]
    p2 = sub_all_top_betweenness_df[sub_all_top_betweenness_df['period'] == period_order[i+1]]
    
    p1 = p1[p1['node'].isin(p2['node'])]
    p2 = p2[p2['node'].isin(p1['node'])]
    print(len(p1), len(p2))
    
    p1['new_rank'] = p1['score'].rank(ascending=False, method='min')
    p2['new_rank'] = p2['score'].rank(ascending=False, method='min')

    tau, p_value = kendalltau(p1['new_rank'], p2['new_rank'])
    # tau, p_value = kendalltau(p1['rank'], p2['rank']) # should be the same
    subset_kendall_tau.append({
        "period_A": period_order[i],
        "period_B": period_order[i+1],
        "kendall_tau": tau,
        "p_value": p_value,
        "sample_size": len(p1)
    })
    print(f"Kendall's Tau between {period_order[i]} and {period_order[i+1]}: {tau:.4f} (p-value: {p_value:.4f}), Sample Size: {len(p1)}")

subset_kendall_tau_df = pd.DataFrame(subset_kendall_tau)
subset_kendall_tau_df

3 3
Kendall's Tau between -2000 and 2001-2005: 1.0000 (p-value: 0.3333), Sample Size: 3
5 5
Kendall's Tau between 2001-2005 and 2006-2010: 1.0000 (p-value: 0.0167), Sample Size: 5
6 6
Kendall's Tau between 2006-2010 and 2011-2015: 1.0000 (p-value: 0.0028), Sample Size: 6
6 6
Kendall's Tau between 2011-2015 and 2016-2020: 0.9661 (p-value: 0.0074), Sample Size: 6
6 6
Kendall's Tau between 2016-2020 and 2021-2025: 1.0000 (p-value: 0.0028), Sample Size: 6


Unnamed: 0,period_A,period_B,kendall_tau,p_value,sample_size
0,-2000,2001-2005,1.0,0.333333,3
1,2001-2005,2006-2010,1.0,0.016667,5
2,2006-2010,2011-2015,1.0,0.002778,6
3,2011-2015,2016-2020,0.966092,0.00741,6
4,2016-2020,2021-2025,1.0,0.002778,6


#### Edge-wise

In [30]:
all_edge_betweenness_df.columns

Index(['Unnamed: 0', 'u', 'v', 'edge_betweenness', 'edge_betweenness_weighted',
       'period', 'color_group', 'rank', 'prev_rank', 'size'],
      dtype='object')

In [None]:
for p in all_edge_betweenness_df['period'].unique():
    period_df = all_edge_betweenness_df[all_edge_betweenness_df['period'] == p]
    print(f"Period: {p}, Zero betweenness count: {(period_df['edge_betweenness_weighted'] == 0).sum()}, Total count: {len(period_df)}, Total edges: {period_df[['u','v']].drop_duplicates().shape[0]}")

Period: 2016-2020, Zero betweenness count: 74, Total count: 120, Total edges: 120
Period: 2021-2025, Zero betweenness count: 70, Total count: 123, Total edges: 123
Period: 2001-2005, Zero betweenness count: 5, Total count: 41, Total edges: 41
Period: -2000, Zero betweenness count: 6, Total count: 32, Total edges: 32
Period: 2011-2015, Zero betweenness count: 51, Total count: 103, Total edges: 103
Period: 2006-2010, Zero betweenness count: 25, Total count: 64, Total edges: 64


In [57]:
import pandas as pd

# Get all unique node pairs (u, v) across all periods
all_pairs = all_edge_betweenness_df[['u', 'v']].drop_duplicates()

# Build a MultiIndex for all combinations of (period, u, v)

periods = all_edge_betweenness_df['period'].unique()
multi_index = pd.MultiIndex.from_product([periods, all_pairs['u'], all_pairs['v']], names=['period', 'u', 'v'])

# Filter to only valid undirected pairs (u != v and sorted)
valid_pairs = pd.DataFrame(multi_index.to_list(), columns=['period', 'u', 'v'])
valid_pairs = valid_pairs[valid_pairs['u'] <= valid_pairs['v']]  # keep one direction for undirected

# Merge with original data, fill missing with 0
complete_edge_betweenness = valid_pairs.merge(
    all_edge_betweenness_df,
    on=['period', 'u', 'v'],
    how='left'
).fillna({'edge_betweenness_weighted': 0})

# If you want only the columns period, u, v, edge_betweenness_weighted:
complete_edge_betweenness = complete_edge_betweenness[['period', 'u', 'v', 'edge_betweenness_weighted']]
complete_edge_betweenness['rank'] = complete_edge_betweenness.groupby('period')['edge_betweenness_weighted'].rank(ascending=False, method='min')

# Kendall's tau on the completed dataframe for each period pair
complete_edge_tau = []
for i in range(len(period_order) - 1):
    p1 = complete_edge_betweenness[complete_edge_betweenness['period'] == period_order[i]]
    p2 = complete_edge_betweenness[complete_edge_betweenness['period'] == period_order[i+1]]
    print(len(p1), len(p2))
    tau, p_value = kendalltau(p1['rank'], p2['rank'])
    complete_edge_tau.append({
        "period_A": period_order[i],
        "period_B": period_order[i+1],
        "kendall_tau": tau,
        "p_value": p_value
    })
complete_edge_tau_df = pd.DataFrame(complete_edge_tau)
complete_edge_tau_df
    

11676 11676
11676 11676
11676 11676
11676 11676
11676 11676


Unnamed: 0,period_A,period_B,kendall_tau,p_value
0,-2000,2001-2005,0.412644,0.0
1,2001-2005,2006-2010,0.581819,0.0
2,2006-2010,2011-2015,0.364353,0.0
3,2011-2015,2016-2020,0.075363,5.108362e-18
4,2016-2020,2021-2025,0.545985,0.0


In [61]:
from scipy.stats import kendalltau

subset_edge_kendall_tau = []

for i in range(len(period_order) - 1):
    A, B = period_order[i], period_order[i+1]

    # Select & copy to avoid SettingWithCopyWarning
    p1 = (all_edge_betweenness_df
          .loc[all_edge_betweenness_df['period'] == A, ['u','v','edge_betweenness_weighted']]
          .copy())
    p2 = (all_edge_betweenness_df
          .loc[all_edge_betweenness_df['period'] == B, ['u','v','edge_betweenness_weighted']]
          .copy())

    # Stable edge key
    p1['edge_key'] = p1['u'].astype(str) + '||' + p1['v'].astype(str)
    p2['edge_key'] = p2['u'].astype(str) + '||' + p2['v'].astype(str)

    # Keep only common edges via inner merge
    merged = (p1[['edge_key','edge_betweenness_weighted']]
              .merge(p2[['edge_key','edge_betweenness_weighted']],
                     on='edge_key', suffixes=('_A','_B')))

    # Drop NaNs if any
    merged = merged.dropna(subset=['edge_betweenness_weighted_A','edge_betweenness_weighted_B'])

    # Rank (highest betweenness gets rank 1)
    merged['rank_A'] = merged['edge_betweenness_weighted_A'].rank(ascending=False, method='min')
    merged['rank_B'] = merged['edge_betweenness_weighted_B'].rank(ascending=False, method='min')

    n = len(merged)
    print(n, n)

    tau, p_value = kendalltau(merged['rank_A'], merged['rank_B'])  # tau-b by default in recent SciPy
    print(f"Kendall's Tau between {A} and {B}: {tau:.4f} (p-value: {p_value:.4f}), Sample Size: {n}")

    subset_edge_kendall_tau.append({
        "period_A": A, "period_B": B,
        "kendall_tau": tau, "p_value": p_value, "sample_size": n
    })

subset_edge_kendall_tau_df = pd.DataFrame(subset_edge_kendall_tau)
subset_edge_kendall_tau_df

23 23
Kendall's Tau between -2000 and 2001-2005: 0.2720 (p-value: 0.0820), Sample Size: 23
35 35
Kendall's Tau between 2001-2005 and 2006-2010: 0.1371 (p-value: 0.2832), Sample Size: 35
55 55
Kendall's Tau between 2006-2010 and 2011-2015: 0.5684 (p-value: 0.0000), Sample Size: 55
80 80
Kendall's Tau between 2011-2015 and 2016-2020: 0.4884 (p-value: 0.0000), Sample Size: 80
93 93
Kendall's Tau between 2016-2020 and 2021-2025: 0.3879 (p-value: 0.0000), Sample Size: 93


Unnamed: 0,period_A,period_B,kendall_tau,p_value,sample_size
0,-2000,2001-2005,0.271976,0.08203575,23
1,2001-2005,2006-2010,0.137071,0.2831767,35
2,2006-2010,2011-2015,0.568368,7.671393e-08,55
3,2011-2015,2016-2020,0.488399,5.803774e-08,80
4,2016-2020,2021-2025,0.38793,4.828774e-06,93


In [102]:
import plotly.graph_objects as go
import plotly.subplots as sp

# Node-wise betweenness scatter plot
fig_btw = px.scatter(
    all_top_betweenness_df,
    x="period", y="score",
    color="color_group",
    hover_data=["node", "kind", "score"],
    category_orders={"period": period_order},
    title="Betweenness distribution per period",
    color_discrete_map=palette,
    size_max=30,
    size=None  # All points same size
)
fig_btw.update_traces(marker=dict(size=30, opacity=0.7))  # Force all points to 30px
fig_btw.update_layout(template=template_type, showlegend=False)


fig = go.Figure(fig_btw)

# --- Inset: add Kendall's tau bars routed to x2/y2 ---
fig.add_bar(
    x=kendall_tau_df['period_B'],
    y=kendall_tau_df['kendall_tau'],
    name="œÑ (all nodes)",
    marker_color="rgba(27,158,119,1)",
    xaxis="x2", yaxis="y2",
    hovertemplate="Period: %{x}<br>œÑ: %{y:.3f}<extra></extra>"
)
fig.add_bar(
    x=subset_kendall_tau_df['period_B'],
    y=subset_kendall_tau_df['kendall_tau'],
    name="œÑ (subset)",
    marker_color="rgba(217,95,2,1)",
    xaxis="x2", yaxis="y2",
    hovertemplate="Period: %{x}<br>œÑ (subset): %{y:.3f}<extra></extra>"
)

# --- New, smaller inset on the LEFT ---
L, B, R, T = 0.06, 0.62, 0.38, 0.92   # ‚Üê left, bottom, right, top (paper coords 0‚Äì1)

fig.update_layout(
    # smaller background card (optional)
    shapes=[dict(
        type="rect", xref="paper", yref="paper",
        x0=L-0.02, y0=B-0.02, x1=R+0.01, y1=T+0.05,
        line=dict(width=1), fillcolor="rgba(0,0,0,0)"
    )],
    xaxis2=dict(
        domain=[L, R], anchor="y2",
        categoryorder="array", categoryarray=period_order,
        tickfont=dict(size=8),
        tickangle=-45,
        showgrid=False, showline=True, zeroline=False
    ),
    yaxis2=dict(
        domain=[B, T], anchor="x2",
        # title="œÑ", 
        tickfont=dict(size=8),
        range=[0, 1],
        showgrid=True, gridwidth=0.5, zeroline=False
    ),
    bargap=0.25,  # tighten bars in the small inset
    showlegend=True,  # legend only for the main plot (not inset
)

# Optional: keep the inset stable and unobtrusive
fig.update_xaxes(fixedrange=True, selector=dict(anchor="y2"))
fig.update_yaxes(fixedrange=True, selector=dict(anchor="x2"))
fig.update_traces(opacity=0.85, selector=dict(xaxis="x2", yaxis="y2"))  # slight transparency

fig.show()


# ----------------------------------------------

import plotly.express as px

# Edge-wise betweenness scatter plot
fig_ebtw = px.scatter(
    all_edge_betweenness_df,
    x="period", y="edge_betweenness_weighted",
    color="color_group",
    hover_data=["u", "v", "edge_betweenness_weighted"],
    category_orders={"period": period_order},
    title="Edge Betweenness distribution per period",
    color_discrete_map=edge_palette,
    size_max=30,
    size=None
)
fig_ebtw.update_traces(marker=dict(size=30, opacity=0.7))
fig_ebtw.update_layout(template=template_type, showlegend=False)

fig_edge = go.Figure(fig_ebtw)

# --- Inset: add Kendall's tau bars for edges ---
fig_edge.add_bar(
    x=complete_edge_tau_df['period_B'],
    y=complete_edge_tau_df['kendall_tau'],
    name="œÑ (all edges)",
    marker_color="rgba(27,158,119,1)",
    xaxis="x2", yaxis="y2",
    hovertemplate="Period: %{x}<br>œÑ: %{y:.3f}<extra></extra>"
)
fig_edge.add_bar(
    x=subset_edge_kendall_tau_df['period_B'],
    y=subset_edge_kendall_tau_df['kendall_tau'],
    name="œÑ (subset edges)",
    marker_color="rgba(217,95,2,1)",
    xaxis="x2", yaxis="y2",
    hovertemplate="Period: %{x}<br>œÑ (subset): %{y:.3f}<extra></extra>"
)

# --- Inset layout ---
fig_edge.update_layout(
    shapes=[dict(
        type="rect", xref="paper", yref="paper",
        x0=L-0.02, y0=B-0.02, x1=R+0.01, y1=T+0.05,
        line=dict(width=1), fillcolor="rgba(0,0,0,0)"
    )],
    xaxis2=dict(
        domain=[L, R], anchor="y2",
        categoryorder="array", categoryarray=period_order,
        tickfont=dict(size=8),
        tickangle=-45,
        showgrid=False, showline=True, zeroline=False
    ),
    yaxis2=dict(
        domain=[B, T], anchor="x2",
        tickfont=dict(size=8),
        range=[0, 1],
        showgrid=True, gridwidth=0.5, zeroline=False
    ),
    bargap=0.25,
    showlegend=True,
)

fig_edge.update_xaxes(fixedrange=True, selector=dict(anchor="y2"))
fig_edge.update_yaxes(fixedrange=True, selector=dict(anchor="x2"))
fig_edge.update_traces(opacity=0.85, selector=dict(xaxis="x2", yaxis="y2"))

fig_edge.show()

In [None]:
# fig.write_html(f"{path}/results/feature-only-KG/img/betweenness_with_tau_over_time_inset.html")
# fig_edge.write_html(f"{path}/results/feature-only-KG/img/edge_betweenness_with_tau_over_time_inset.html")

## Normalized degree

In [14]:
norm_degree_all = {}
for file in os.listdir(f"{path}/results/feature-only-KG/periods/degree_normalized"):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        df = pd.read_csv(f"{path}/results/feature-only-KG/periods/degree_normalized/{file}")
        df['period'] = period
        norm_degree_all[period] = df
all_norm_degree_df = pd.concat(norm_degree_all.values(), ignore_index=True)

In [15]:
all_norm_degree_df

Unnamed: 0,node,kind,degree,triplets,normalized_degree,period
0,Absolute Mobility Measures,Measure,3,2,0.750000,-2000
1,Decomposition / Structural Approaches,Measure,5,4,0.625000,-2000
2,Regression‚Äêbased Measures,Measure,6,6,0.500000,-2000
3,Transition Matrix / Probability Measures,Measure,4,3,0.666667,-2000
4,Others_Measure,Measure,3,2,0.750000,-2000
...,...,...,...,...,...,...
134,"Policy, Institutional, and Geographic Impacts",RqType,12,21,0.285714,2016-2020
135,"Mobility and Non-Income Outcomes (Health, Well...",RqType,6,9,0.333333,2016-2020
136,Perceptions of Mobility and Attitudes,RqType,5,4,0.625000,2016-2020
137,Theoretical and Structural Models,RqType,6,7,0.428571,2016-2020


In [16]:
showlegend = True

all_norm_degree_df["node"] = all_norm_degree_df["node"].apply(norm)
all_norm_degree_df["color_group"] = np.where(all_norm_degree_df["node"].isin(HIGHLIGHTS), all_norm_degree_df["node"], "Other")

node_palette = {name: px.colors.qualitative.Dark2[i % len(px.colors.qualitative.Dark2)] for i, name in enumerate(sorted(set(all_norm_degree_df["color_group"]) - {"Other"}))}
node_palette["Other"] = "#C7C7C7"

max_size = 30
min_size = 6

# def linear_size(rank, max_rank, min_size, max_size):
#     if max_rank == 1:
#         return max_size
#     return max_size - (rank - 1) * (max_size - min_size) / (max_rank - 1)

# # Compute size per period
# all_norm_degree_df["degree_rank"] = all_norm_degree_df.groupby("period")["degree"].rank(ascending=False, method="min")

# all_norm_degree_df["size"] = all_norm_degree_df.groupby("period").apply(
#     lambda g: g["degree_rank"].apply(lambda r: linear_size(r, g["degree_rank"].max(), min_size, max_size))
# ).reset_index(level=0, drop=True)

all_norm_degree_df["size_triplet"] = all_norm_degree_df.groupby("period").apply(
    lambda g: g["triplets"].apply(lambda r: linear_size(r, g["triplets"].max(), min_size, max_size))).reset_index(level=0, drop=True)

# ------------------------------

fig_nd = px.scatter(
    all_norm_degree_df,
    x="period", y="normalized_degree",
    size = "size_triplet", # ‚ö†Ô∏è Change the size to the previous ranks?
    color = "color_group",
    hover_data=["node","triplets","normalized_degree"],
    category_orders={"period": period_order},
    title="Normalized degree per period (size ‚àù degree)",
    color_discrete_map=palette
)
fig_nd.update_layout(
    template=template_type, showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
    )

fig_nd.show()





In [17]:
all_norm_degree_df['normalized_degree'].describe(), all_norm_degree_df['triplets'].describe()

(count    139.000000
 mean       0.540405
 std        0.273227
 min        0.200000
 25%        0.294786
 50%        0.444444
 75%        0.750000
 max        1.000000
 Name: normalized_degree, dtype: float64,
 count    139.000000
 mean      10.683453
 std       10.587137
 min        1.000000
 25%        2.000000
 50%        7.000000
 75%       17.500000
 max       42.000000
 Name: triplets, dtype: float64)

In [None]:
import plotly.graph_objects as go
import plotly.subplots as sp

norm_plt = all_norm_degree_df.sort_values(by=["period","degree"], ascending=[True,False])

fig = sp.make_subplots(rows=2, cols=3, subplot_titles=period_order)

for idx, period in enumerate(period_order):
    if period not in norm_plt['period'].values:
        continue

    period_data = norm_plt[norm_plt['period'] == period]

    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 1
    
    # Add scatter points
    fig.add_trace(go.Scatter(
        x=period_data['triplets'], 
        y=period_data['normalized_degree'],
        mode='markers', 
        marker=dict(
            size=8,
            color=[palette.get(cg, "#C7C7C7") for cg in period_data['color_group']]
        ),
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "#Triplets: %{x}<br>" +
            "Normalized_degree: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['normalized_degree']), axis=-1),
        showlegend=False
    ), row=row, col=col)  # Added row and col here
    
    # Fit a powerlaw: normalized_degree = a * triplets^b
    # Only use points with triplets > 0 and normalized_degree > 0
    mask = (period_data['triplets'] > 0) & (period_data['normalized_degree'] > 0)
    x = period_data.loc[mask, 'triplets'].to_numpy()
    y = period_data.loc[mask, 'normalized_degree'].to_numpy()
    if len(x) > 1 and len(y) > 1:
        # log-log fit: log(y) = log(a) + b*log(x)
        logx = np.log(x)
        logy = np.log(y)
        # b, loga = np.polyfit(logx, y, 1)
        b, loga = np.polyfit(logx, logy, 1)
        a = np.exp(loga)
        x_trend = np.linspace(x.min(), x.max(), 100)
        y_trend = a * x_trend ** b
    else:
        x_trend = np.array([])
        y_trend = np.array([])
        
    # Add trendline
    fig.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=2),
            showlegend=False,
            hoverinfo='skip'
        ),
        row=row, col=col
    )

fig.update_layout(
    template=template_type, 
    showlegend=showlegend,
    title_text="Normalized Degree vs Number of Triplets by period",

)

fig.update_xaxes(type="log", autorange=False, range = [0,2])
fig.update_yaxes(type="log", title_text="Normalized Degree", range=[-1, 0])
fig.update_xaxes(title_text="Triplets", row=2)

fig.update_layout(template='none', showlegend=showlegend, 
                    #  plot_bgcolor = "rgba(0,0,0,0)", 
                    #  paper_bgcolor = "rgba(0,0,0,0)"
                     )

fig.show()

## Zooming into x=[0,10] -> Nothing found

In [47]:
import plotly.graph_objects as go
import plotly.subplots as sp

norm_plt = all_norm_degree_df.sort_values(by=["period","degree"], ascending=[True,False])

fig = sp.make_subplots(rows=2, cols=3, subplot_titles=period_order)

norm_plt = norm_plt[norm_plt['triplets'] <= 10]

for idx, period in enumerate(period_order):
    if period not in norm_plt['period'].values:
        continue

    period_data = norm_plt[norm_plt['period'] == period]

    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 1
    
    # Add scatter points
    fig.add_trace(go.Scatter(
        x=period_data['triplets'], 
        y=period_data['normalized_degree'],
        mode='markers',
        marker=dict(
            size=8,
            color=[palette.get(cg, "#C7C7C7") for cg in period_data['color_group']]
        ),
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "#Triplets: %{x}<br>" +
            "Normalized_degree: %{customdata[2]}<br>" +
            "<extra></extra>",
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['normalized_degree']), axis=-1),
        showlegend=False
    ), row=row, col=col)  # Added row and col here
    
    # Calculate OLS fit
    z = np.polyfit(period_data['triplets'], period_data['normalized_degree'], 1)
    p = np.poly1d(z)
    x_trend = np.linspace(period_data['triplets'].min(), period_data['triplets'].max(), 100)
    y_trend = p(x_trend)
    
    # Add trendline
    fig.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=2),
            showlegend=False,
            # Add annotation for slope/intercept as text on plot
            name=f"Slope: {z[0]:.4f}, Intercept: {z[1]:.4f}",
            # No hovertext
        ),
        row=row, col=col
    )
    # Add annotation to subplot
    fig.add_annotation(
        text=f"Slope: {z[0]:.4f}<br>Intercept: {z[1]:.4f}",
        xref=f"x{idx+1}" if idx > 0 else "x",
        yref=f"y{idx+1}" if idx > 0 else "y",
        x=period_data['triplets'].max(),
        y=period_data['normalized_degree'].min(),
        showarrow=False,
        font=dict(size=12, color="DarkSlateGrey"),
        align="left",
        row=row, col=col
    )

fig.update_layout(
    template=template_type, 
    showlegend=showlegend,
    title_text="Normalized Degree vs Number of Triplets by period",

)

fig.update_xaxes(range=[0, 11])
fig.update_xaxes(title_text="Triplets", row=2)
fig.update_yaxes(title_text="Normalized Degree", range=[0, 1.1], dtick=0.2)

fig.update_layout(template=template_type, showlegend=showlegend, 
                    #  plot_bgcolor = "rgba(0,0,0,0)", 
                    #  paper_bgcolor = "rgba(0,0,0,0)"
                     )

fig.show()

# Resurgence