# 0. Libraries

In [1]:
# --- Notebook setup: core imports, run date and project root ---
import pandas as pd
import numpy as np
import os
from datetime import date
# Run date; it is embedded in the names of the result files written below.
today = date.today()
# Project root, taken as the PARENT of the current working directory.
path = os.path.dirname(os.getcwd())
# Note: the message says "working directory" but the value shown is its parent.
print(f'üìÇ Current working directory: {path}')
print(f'üíö Today is {today}')
import sys
# Make the project's sibling `scripts` folder importable for the local module below.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'scripts'))
import ss_api_call as ss

üìÇ Current working directory: /Users/serenekim/Desktop/PhD/meta-wealth_mobility
üíö Today is 2025-10-03


# 1. Feature-Only KG in Neo4j

In [124]:
# Load the cleaned studies table from <project root>/data_abstracts.
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv')

In [115]:
# Number of missing values in each column.
df.isna().sum()

id                           0
title                        0
year                         0
doi                         87
landing_page                 1
abstract_inverted_index     33
language                     0
is_oa                        0
oa_status                    0
oa_link                    295
abstract                    33
abstract_sm                104
authors_sm                 508
domain                       2
sort_gpt_1                  34
sort_gpt_2                 516
sort_gpt4o_1                33
sort_gpt4o_2               516
cited_by                   139
len_cited_by                 0
ref_count                    0
cited_by_count               0
Q1                           0
Q1_1                         0
Q2                           0
Q2_1                        62
Q2_2                       555
Q3                          55
Q4                         538
abs                          0
index                        0
category_n1                  0
measure 

In [125]:
# Assign every study to a publication period. Intervals are right-closed, so
# 2000 lands in "-2000" and 2001 in "2001-2005"; years outside (1900, 2025]
# become NaN.
df['period'] = pd.cut(
    df['year'],
    bins=[1900, 2000, 2005, 2010, 2015, 2020, 2025],
    labels=["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"],
    right=True,
)
# Studies per period, listed in category (chronological) order.
df['period'].value_counts().sort_index()

period
-2000         23
2001-2005     23
2006-2010     65
2011-2015    141
2016-2020    183
2021-2025    182
Name: count, dtype: int64

In [17]:
import os

import pandas as pd
from neo4j import GraphDatabase

# --- Normalize categories ---
# "Others" appears in several taxonomies; suffix it per column so the three
# "Others" buckets get distinct names.
df['category_1'] = df['category_1'].replace({'Others': 'Others_Measure'})
df['category_2'] = df['category_2'].replace({'Others': 'Others_Measure'})
df['data_cat']   = df['data_cat'].replace({'Others': 'Others_DataType'})
df['rq_cat']     = df['rq_cat'].replace({'Others': 'Others_RqType'})

def safe_str(val):
    """Return ``val`` as a stripped string, or None when it is missing or blank."""
    if pd.isna(val) or str(val).strip().lower() in {"", "nan", "none"}:
        return None
    return str(val).strip()

# --- Collect rows ---
rows = []
for _, row in df.iterrows():
    m1       = safe_str(row.get("category_1"))
    # m2       = safe_str(row.get("category_2"))
    datatype = safe_str(row.get("data_cat"))
    rqtype   = safe_str(row.get("rq_cat"))
    # A missing period is a float NaN, which is truthy; normalise it through
    # safe_str so rows without a period are skipped instead of written as NaN.
    period   = safe_str(row.get("period"))
    paper_id = safe_str(row.get("id"))

    if not all([m1, datatype, rqtype, period]):
        continue  # skip incomplete rows

    rows.append({
        "m1": m1,
        # "m2": m2,
        "datatype": datatype,
        "rqtype": rqtype,
        "period": period,
        "paper_id": paper_id
    })

# --- Constraints ---
constraint_statements = [
    "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Measure)   REQUIRE m.name IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (ds:DataType) REQUIRE ds.name IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (r:RqType)    REQUIRE r.name IS UNIQUE",
]

# --- Cypher with undirected edges ---
# One relationship per (node pair, period). `w` counts the distinct papers on
# the relationship: it is only incremented when the paper is not yet listed in
# `papers`, so re-running this cell does not inflate the weights.
cypher = """
UNWIND $rows AS row

MERGE (m1:Measure {name: row.m1})
MERGE (ds:DataType {name: row.datatype})
MERGE (rq:RqType {name: row.rqtype})

// Measure -- DataType
MERGE (m1)-[r1:CO_MEASURE_DATATYPE {period: row.period}]-(ds)
  ON CREATE SET r1.w = 1, r1.papers=[row.paper_id]
  ON MATCH  SET r1.w = CASE WHEN row.paper_id IN coalesce(r1.papers, []) THEN r1.w ELSE r1.w + 1 END,
                r1.papers = apoc.coll.toSet(coalesce(r1.papers, []) + row.paper_id)

// DataType -- RqType
MERGE (ds)-[r2:CO_DATATYPE_RQTYPE {period: row.period}]-(rq)
  ON CREATE SET r2.w = 1, r2.papers=[row.paper_id]
  ON MATCH  SET r2.w = CASE WHEN row.paper_id IN coalesce(r2.papers, []) THEN r2.w ELSE r2.w + 1 END,
                r2.papers = apoc.coll.toSet(coalesce(r2.papers, []) + row.paper_id)

// RqType -- Measure
MERGE (rq)-[r3:CO_RQTYPE_MEASURE {period: row.period}]-(m1)
  ON CREATE SET r3.w = 1, r3.papers=[row.paper_id]
  ON MATCH  SET r3.w = CASE WHEN row.paper_id IN coalesce(r3.papers, []) THEN r3.w ELSE r3.w + 1 END,
                r3.papers = apoc.coll.toSet(coalesce(r3.papers, []) + row.paper_id)

"""

# --- Connect & write ---
# Connection settings can be overridden through the environment so a real
# password never has to be stored in the notebook; the fallbacks are the
# original hard-coded values.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7690"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "your_password"),
    ),
)
try:
    with driver.session() as session:
        # consume() waits for each statement to finish and raises server-side
        # errors instead of letting them be discarded when the session closes.
        for statement in constraint_statements:
            session.run(statement).consume()
        if rows:
            session.run(cypher, rows=rows).consume()
finally:
    # Release the connection pool even if a statement failed.
    driver.close()


# 2. Centrality Measures

In [119]:
# pip install neo4j pandas
import os

from neo4j import GraphDatabase
import pandas as pd

# --- CONFIG ---
# Connection settings are read from the environment when present, so a real
# password never has to be stored in the notebook; the fallbacks are the
# original hard-coded values.
NEO4J_URI  = os.environ.get("NEO4J_URI", "bolt://localhost:7690")
NEO4J_AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "your_password"),
)
# Name of the all-periods GDS projection; per-period names are derived from it.
GRAPH_NAME = "features"

# Relationship types of the feature graph (as a list in this cell).
REL_TYPES = [
    "CO_MEASURE_DATATYPE",
    "CO_DATATYPE_RQTYPE",
    "CO_RQTYPE_MEASURE"
]

# --- UTILS ---
# Module-level driver shared by the query helpers defined below.
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

def run_df(cypher, params=None):
    """Execute a Cypher statement and return its records as a DataFrame.

    Args:
        cypher: Cypher text to run.
        params: Optional query parameters; a falsy value means "no parameters".

    Returns:
        pandas.DataFrame with one row per returned record.
    """
    parameters = params if params else {}
    with driver.session() as session:
        records = session.run(cypher, parameters).data()
    return pd.DataFrame(records)

def run(cypher, params=None):
    """Execute a Cypher statement for its side effects only.

    Args:
        cypher: Cypher text to run.
        params: Optional query parameters; a falsy value means "no parameters".

    The result is consumed explicitly so the statement has fully completed and
    any server-side error is raised here rather than being discarded when the
    session closes.
    """
    with driver.session() as s:
        s.run(cypher, params or {}).consume()
        
def graph_name_for_period(period: str) -> str:
    """Return the name of the per-period projection: ``<GRAPH_NAME>__<period>``.

    The period label is used verbatim; nothing is sanitised here, so adjust
    this if labels ever contain characters unsuitable for a graph name.
    """
    return "{}__{}".format(GRAPH_NAME, period)

# --- 1) GDS PROJECTION (UNDIRECTED, WEIGHTED) ---
def ensure_graph_projection():
    """(Re)create the all-periods GDS projection named ``GRAPH_NAME``.

    An existing projection with that name is dropped first (the ``false``
    argument means a missing graph is not an error). The relationship query
    matches the undirected pattern and exposes ``w`` and its inverse ``inv_w``
    as relationship properties. The projection summary is printed.
    """
    graph_params = {"name": GRAPH_NAME}

    # Drop if exists
    run("CALL gds.graph.drop($name, false) YIELD graphName", graph_params)

    projection_query = """
    CALL gds.graph.project.cypher(
      $name,
      // Nodes
      'MATCH (n:Measure|DataType|RqType)
       RETURN id(n) AS id, labels(n) AS labels',
      // Relationships (all three types), undirected, with w and inv_w as top-level props
      'MATCH (a)-[r:CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE]-(b)
       RETURN id(a) AS source,
              id(b) AS target,
              type(r) AS type,
              coalesce(r.w, 1.0) AS w,
              CASE WHEN coalesce(r.w,0) > 0 THEN 1.0 / r.w ELSE 1e12 END AS inv_w,
              "UNDIRECTED" AS orientation'
    )
    YIELD graphName, nodeCount, relationshipCount
    """
    summary = run_df(projection_query, graph_params)
    print(summary)


def period_graph_projection(period: str):
    """(Re)create the GDS projection restricted to one period.

    Args:
        period: Period label stored on the relationships (``r.period``),
            e.g. "2001-2005".

    The projection is named via ``graph_name_for_period(period)``; an existing
    graph with that name is dropped first. All Measure/DataType/RqType nodes
    are projected, but only relationships whose ``period`` equals ``period``.
    The projection summary is printed.
    """
    # The original signature read `period=str`, which made the `str` type a
    # default value rather than an annotation.
    name = graph_name_for_period(period)
    # Drop if exists
    run("CALL gds.graph.drop($name, false) YIELD graphName", {"name": name})

    cypher = """
    CALL gds.graph.project.cypher(
      $name,
      // Nodes
      'MATCH (n:Measure|DataType|RqType)
       RETURN id(n) AS id, labels(n) AS labels',
      // Relationships (all three types), undirected, with w and inv_w as top-level props
      'MATCH (a)-[r:CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE]-(b)
       WHERE r.period = $period
       RETURN id(a) AS source,
              id(b) AS target,
              type(r) AS type,
              coalesce(r.w, 1.0) AS w,
              CASE WHEN coalesce(r.w,0) > 0 THEN 1.0 / r.w ELSE 1e12 END AS inv_w,
              "UNDIRECTED" AS orientation',
      { parameters: {period: $period} }
    )
    YIELD graphName, nodeCount, relationshipCount
    """
    df = run_df(cypher, {"name": name, "period": period})
    print(df)


# --- 2) METRICS (STREAM) ---
def top_degree(name: str):
    """Stream unweighted degree for projection ``name``, highest score first.

    Returns a DataFrame with columns ``kind`` (first label), ``node`` and ``score``.
    """
    query = """
    CALL gds.degree.stream($name)
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """
    return run_df(query, {"name": name})

def top_strength(name: str):
    """Stream weighted degree (weight property ``w``) for projection ``name``.

    Returns a DataFrame with columns ``kind`` (first label), ``node`` and
    ``strength``, highest strength first.
    """
    query = """
    CALL gds.degree.stream($name, {relationshipWeightProperty:'w'})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score AS strength
    ORDER BY strength DESC
    """
    return run_df(query, {"name": name})

def top_betweenness(name: str, weight_prop: str | None = None):
    """Stream node betweenness for projection ``name``, highest score first.

    Args:
        name: Name of the GDS projection.
        weight_prop: Relationship property passed as
            ``relationshipWeightProperty``; ``None`` sends an empty config.

    Returns a DataFrame with columns ``kind`` (first label), ``node`` and ``score``.
    """
    if weight_prop is None:
        opts = "{}"
    else:
        opts = "{{relationshipWeightProperty:'{}'}}".format(weight_prop)

    # The config map is spliced into the query text; the graph name stays a parameter.
    query = f"""
    CALL gds.betweenness.stream($name, {opts})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """
    return run_df(query, {"name": name})

def hops_count(name: str):
    """Stream all-pairs shortest-path distances for projection ``name``.

    No weight property is passed to the procedure. Returns a DataFrame with
    columns ``source``, ``target`` (node names) and ``hops``, longest first.
    """
    query = """
    CALL gds.allShortestPaths.stream($name)
    YIELD sourceNodeId, targetNodeId, distance
    RETURN gds.util.asNode(sourceNodeId).name AS source,
        gds.util.asNode(targetNodeId).name AS target,
        distance AS hops
    ORDER BY hops DESC
    """
    return run_df(query, {"name": name})

# def louvain():
#     cypher = f"""
#     CALL gds.louvain.stream($name, {{relationshipWeightProperty:'w'}})
#     YIELD nodeId, communityId
#     WITH communityId, gds.util.asNode(nodeId) AS n
#     RETURN communityId,
#            collect(n.name)[0..10] AS sample_members,
#            count(*) AS size
#     ORDER BY size DESC, communityId ASC
#     """
#     return run_df(cypher, {"name": GRAPH_NAME})



In [20]:
# Compute the centrality tables on the all-periods projection. The guard is
# always true when the cell is executed in a notebook kernel.
if __name__ == "__main__":
    # Rebuild the projection first so the metrics reflect the current graph.
    ensure_graph_projection()
    top_degree_df = top_degree(name=GRAPH_NAME)
    top_strength_df = top_strength(name=GRAPH_NAME)
    # Weighted betweenness uses the inverse weight as the relationship weight.
    top_betweenness_df = top_betweenness(name=GRAPH_NAME, weight_prop='inv_w')
    hops_count_df = hops_count(name=GRAPH_NAME)
    # louvain_df = louvain()
    top_betweenness_noweight_df = top_betweenness(name=GRAPH_NAME, weight_prop=None)

    # Triplets to DataFrame / CSV
    # trips = get_triplets(limit=50)
    # print("\nSample triplets:")
    # print(trips.head(10))
    # export_triplets_csv("triplets.csv")

    # NOTE: this closes the shared module-level driver; the query helpers
    # cannot be used again until a new `driver` is created.
    driver.close()



  graphName  nodeCount  relationshipCount
0  features         30                966


In [21]:
# Write the all-periods centrality tables, suffixed with the run date.
# The target folder is created first; to_csv does not create missing directories.
feature_results_dir = f"{path}/results/feature-only-KG"
os.makedirs(feature_results_dir, exist_ok=True)

top_betweenness_df.to_csv(f"{feature_results_dir}/top_betweenness_{today}.csv")
top_degree_df.to_csv(f"{feature_results_dir}/top_degree_{today}.csv")
top_strength_df.to_csv(f"{feature_results_dir}/top_strength_{today}.csv")
hops_count_df.to_csv(f"{feature_results_dir}/hops_count_{today}.csv")
# louvain_df.to_csv(f"{path}/results/feature-only-KG/louvain.csv")
top_betweenness_noweight_df.to_csv(f"{feature_results_dir}/top_betweenness_noweight_{today}.csv")

In [5]:
import os

from neo4j import GraphDatabase
import networkx as nx
import pandas as pd
from collections import defaultdict

# --- CONFIG ---
# Connection settings are read from the environment when present, so a real
# password never has to be stored in the notebook; the fallbacks are the
# original hard-coded values.
NEO4J_URI  = os.environ.get("NEO4J_URI", "bolt://localhost:7690")
NEO4J_AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "your_password"),
)
# NOTE: here REL_TYPES is a '|'-joined STRING ready to splice into a Cypher
# pattern, whereas the centrality config cell defines it as a list. Code that
# reads REL_TYPES has to cope with whichever cell ran last.
REL_TYPES  = "CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE"  # undirected semantics

# Fresh driver for the NetworkX section (replaces any earlier `driver`).
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

def run_df(cypher, params=None):
    """Run a Cypher statement and return its records as a pandas DataFrame.

    A falsy ``params`` is sent as an empty parameter map.
    """
    parameters = params if params else {}
    with driver.session() as session:
        result = session.run(cypher, parameters)
        records = result.data()
    return pd.DataFrame(records)

# --- 0) Pull nodes & edges from Neo4j ---
# nodes: name + the label queried as "kind" (one UNION branch per label)
nodes_df = run_df("""
MATCH (n:Measure) RETURN n.name AS name, 'Measure' AS kind
UNION ALL
MATCH (n:DataType) RETURN n.name AS name, 'DataType' AS kind
UNION ALL
MATCH (n:RqType) RETURN n.name AS name, 'RqType' AS kind
""")

# edges: one row per matched relationship with its endpoints, weight (1 when
# `w` is missing), internal id and type. REL_TYPES is spliced in as a
# '|'-separated type pattern.
# NOTE(review): the pattern has no direction, so each relationship is matched
# once per orientation (as a->b and as b->a); `rel_id` identifies the repeats.
edges_df = run_df(f"""
MATCH (a)-[r:{REL_TYPES}]-(b)
RETURN a.name AS u, labels(a)[0] AS kind_u,
       b.name AS v, labels(b)[0] AS kind_v,
       coalesce(r.w,1) AS w, id(r) AS rel_id, type(r) AS rel_type
""")

# --- 1) Build undirected weighted graph in NetworkX ---
G = nx.Graph()
for _, row in nodes_df.iterrows():
    G.add_node(row["name"], kind=row["kind"])

# The edge query matches an undirected pattern, so every relationship comes
# back once per orientation. Track relationship ids so each one contributes
# its weight exactly once.
seen_rel_ids = set()
for _, row in edges_df.iterrows():
    rel_id = row["rel_id"]
    if rel_id in seen_rel_ids:
        continue
    seen_rel_ids.add(rel_id)

    u, v = row["u"], row["v"]
    w    = float(row["w"])
    # Several relationships can join the same pair of nodes (e.g. one per
    # period); accumulate them on a single NetworkX edge.
    if G.has_edge(u, v):
        G[u][v]["weight"] += w
        G[u][v]["rel_ids"].append(rel_id)
        G[u][v]["rel_types"].add(row["rel_type"])
    else:
        G.add_edge(u, v,
                   weight=w,
                   rel_ids=[rel_id],
                   rel_types={row["rel_type"]})

# --- 2a) Edge betweenness, unweighted ---
# Every edge counts as one hop, so this ranks "fewest hops" bridges.
edge_betweenness = nx.edge_betweenness_centrality(G, normalized=True)  # {(u, v): score}
edge_betweenness_df = pd.DataFrame(
    [(*edge, score) for edge, score in edge_betweenness.items()],
    columns=["u", "v", "edge_betweenness"],
)

# --- 2b) Edge betweenness, weighted ---
# Shortest paths need a cost, so store the inverse of the accumulated weight
# on each edge as its "length".
for u, v, data in G.edges(data=True):
    data["length"] = 1.0 / data["weight"]

edge_betweenness_w = nx.edge_betweenness_centrality(G, weight="length", normalized=True)
edge_betweenness_w_df = pd.DataFrame(
    [(*edge, score) for edge, score in edge_betweenness_w.items()],
    columns=["u", "v", "edge_betweenness_weighted"],
)

# --- 3) Jaccard similarity (overlap of neighborhoods) for existing edges ---
def jaccard_for_edge(G, u, v):
    """Jaccard similarity of the neighbour sets of ``u`` and ``v`` in ``G``.

    Returns |N(u) & N(v)| / |N(u) | N(v)|, or 0.0 when neither node has neighbours.
    """
    neighbours_u = set(G.neighbors(u))
    neighbours_v = set(G.neighbors(v))
    combined = neighbours_u.union(neighbours_v)
    if not combined:
        return 0.0
    shared = neighbours_u.intersection(neighbours_v)
    return len(shared) / len(combined)

# Jaccard score for every existing edge, then the same data as a table.
edge_jaccard = {}
for u, v in G.edges():
    edge_jaccard[(u, v)] = jaccard_for_edge(G, u, v)

edge_jaccard_df = pd.DataFrame(
    [(*edge, score) for edge, score in edge_jaccard.items()],
    columns=["u", "v", "edge_jaccard"],
)




In [None]:
# edge_betweenness_df.to_csv(f"{path}/results/feature-only-KG/edge_betweenness.csv")
# edge_jaccard_df.to_csv(f"{path}/results/feature-only-KG/edge_jaccard.csv")
# edge_betweenness_w_df.to_csv(f"{path}/results/feature-only-KG/edge_betweenness_weighted.csv")

In [36]:
import networkx as nx

def edge_betweenness_per_period(period: str) -> pd.DataFrame:
    """Edge betweenness (unweighted and weighted) for one period's graph.

    Args:
        period: Period label stored on the relationships (``r.period``).

    Returns:
        DataFrame with columns ``u``, ``v``, ``edge_betweenness`` and
        ``edge_betweenness_weighted``, one row per edge of that period.
    """
    # REL_TYPES is a list in one config cell and a '|'-joined string in
    # another. Joining a string would split it into single characters and the
    # query would silently match nothing, so accept both forms.
    rel_pattern = REL_TYPES if isinstance(REL_TYPES, str) else "|".join(REL_TYPES)

    # Pull edges for the given period
    edges_df = run_df(f"""
    MATCH (a)-[r:{rel_pattern}]-(b)
    WHERE r.period = $period
    RETURN a.name AS u, labels(a)[0] AS kind_u,
           b.name AS v, labels(b)[0] AS kind_v,
           coalesce(r.w,1) AS w, id(r) AS rel_id, type(r) AS rel_type
    """, { "period": period })

    # Pull nodes (all types)
    nodes_df = run_df("""
    MATCH (n:Measure) RETURN n.name AS name, 'Measure' AS kind
    UNION ALL
    MATCH (n:DataType) RETURN n.name AS name, 'DataType' AS kind
    UNION ALL
    MATCH (n:RqType) RETURN n.name AS name, 'RqType' AS kind
    """)

    # Build graph
    G = nx.Graph()
    for _, row in nodes_df.iterrows():
        G.add_node(row["name"], kind=row["kind"])

    # The undirected match returns every relationship once per orientation;
    # skip ids already seen so each relationship adds its weight only once.
    seen_rel_ids = set()
    for _, row in edges_df.iterrows():
        rel_id = row["rel_id"]
        if rel_id in seen_rel_ids:
            continue
        seen_rel_ids.add(rel_id)

        u, v = row["u"], row["v"]
        w    = float(row["w"])
        if G.has_edge(u, v):
            G[u][v]["weight"] += w
            G[u][v]["rel_ids"].append(rel_id)
            G[u][v]["rel_types"].add(row["rel_type"])
        else:
            G.add_edge(u, v,
                       weight=w,
                       rel_ids=[rel_id],
                       rel_types={row["rel_type"]})

    # Unweighted betweenness
    edge_betweenness = nx.edge_betweenness_centrality(G, normalized=True)
    edge_betweenness_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness.items()],
        columns=["u", "v", "edge_betweenness"]
    )

    # Weighted betweenness (inverse weight as length)
    for u, v, data in G.edges(data=True):
        data["length"] = 1.0 / data["weight"]
    edge_betweenness_w = nx.edge_betweenness_centrality(G, normalized=True, weight="length")
    edge_betweenness_w_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness_w.items()],
        columns=["u", "v", "edge_betweenness_weighted"]
    )

    # Merge and return
    return edge_betweenness_df.merge(edge_betweenness_w_df, on=["u", "v"], how="outer")


In [37]:
# One edge-betweenness CSV per period, suffixed with the run date.
edge_betweenness_dir = f"{path}/results/feature-only-KG/periods/edge_betweenness"
for p in df['period'].cat.categories:   # the six categorical periods
    period_df = edge_betweenness_per_period(p)
    os.makedirs(edge_betweenness_dir, exist_ok=True)
    period_df.to_csv(f"{edge_betweenness_dir}/{p}_{today}.csv")




# 3. Temporal

In [22]:
# Period labels in category order; the loops below iterate over these.
df['period'].cat.categories

Index(['-2000', '2001-2005', '2006-2010', '2011-2015', '2016-2020',
       '2021-2025'],
      dtype='object')

In [27]:
# For every period: project that period's graph, compute the node metrics and
# write one dated CSV per metric under periods/<metric>/.
periods_dir = f"{path}/results/feature-only-KG/periods"

for p in df['period'].cat.categories:
    print(f"\n--- Period: {p} ---")
    period_graph_projection(period=p)
    gname = graph_name_for_period(p)

    top_degree_df = top_degree(name=gname)
    top_strength_df = top_strength(name=gname)
    top_betweenness_df = top_betweenness(name=gname, weight_prop='inv_w')
    hops_count_df = hops_count(name=gname)
    top_betweenness_noweight_df = top_betweenness(name=gname, weight_prop=None)

    # Sub-folder name -> table written into it for this period.
    tables_by_metric = {
        "top_betweenness": top_betweenness_df,
        "top_degree": top_degree_df,
        "top_strength": top_strength_df,
        "hops_count": hops_count_df,
        "top_betweenness_noweight": top_betweenness_noweight_df,
    }

    os.makedirs(periods_dir, exist_ok=True)
    for metric in tables_by_metric:
        os.makedirs(f"{periods_dir}/{metric}", exist_ok=True)

    for metric, table in tables_by_metric.items():
        table.to_csv(f"{periods_dir}/{metric}/{p}_{today}.csv")




--- Period: -2000 ---
         graphName  nodeCount  relationshipCount
0  features__-2000         30                 64

--- Period: 2001-2005 ---




             graphName  nodeCount  relationshipCount
0  features__2001-2005         30                 82

--- Period: 2006-2010 ---
             graphName  nodeCount  relationshipCount
0  features__2006-2010         30                128





--- Period: 2011-2015 ---
             graphName  nodeCount  relationshipCount
0  features__2011-2015         30                206

--- Period: 2016-2020 ---
             graphName  nodeCount  relationshipCount
0  features__2016-2020         30                240





--- Period: 2021-2025 ---
             graphName  nodeCount  relationshipCount
0  features__2021-2025         30                246


## Normalized degree: degree / (2 × triplets)

In [126]:
# WARNING: `df` must still be the studies table loaded at the top of the
# notebook (it needs the categorical 'period' column).
# Run "2. Centrality Measures" first so that `run_df` is defined.

# Per-period normalized degree. The query has one branch per node label, each
# doing the same thing for a node of that label:
#   1. match closed Measure-DataType-RqType triangles whose three
#      relationships all carry the requested period, and count the distinct
#      (m, d, r) combinations as `triplets`;
#   2. count the node's distinct neighbours over relationships of that period
#      as `degree`;
#   3. return degree / (2 * triplets) as `normalized_degree`.
# Nodes that take part in no such triangle in the period are not returned.
cypher = """
// For Measures
MATCH (m:Measure)-[:CO_MEASURE_DATATYPE {period:$period}]-(d:DataType)
      -[:CO_DATATYPE_RQTYPE {period:$period}]-(r:RqType)
      -[:CO_RQTYPE_MEASURE {period:$period}]-(m)
WITH m, count(DISTINCT [m,d,r]) AS triplets
MATCH (m)-[rel {period:$period}]-(n)
WITH m.name AS node, 'Measure' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets, 
       toFloat(degree) / (2.0 * triplets) AS normalized_degree

UNION ALL
// For DataTypes
MATCH (d:DataType)-[:CO_DATATYPE_RQTYPE {period:$period}]-(r:RqType)
      -[:CO_RQTYPE_MEASURE {period:$period}]-(m:Measure)
      -[:CO_MEASURE_DATATYPE {period:$period}]-(d)
WITH d, count(DISTINCT [m,d,r]) AS triplets
MATCH (d)-[rel {period:$period}]-(n)
WITH d.name AS node, 'DataType' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets, 
       toFloat(degree) / (2.0 * triplets) AS normalized_degree

UNION ALL
// For RqTypes
MATCH (r:RqType)-[:CO_RQTYPE_MEASURE {period:$period}]-(m:Measure)
      -[:CO_MEASURE_DATATYPE {period:$period}]-(d:DataType)
      -[:CO_DATATYPE_RQTYPE {period:$period}]-(r)
WITH r, count(DISTINCT [m,d,r]) AS triplets
MATCH (r)-[rel {period:$period}]-(n)
WITH r.name AS node, 'RqType' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets, 
       toFloat(degree) / (2.0 * triplets) AS normalized_degree
"""



# One normalized-degree CSV per period, suffixed with the run date.
# The query result gets its own name: the original loop assigned it to `df`,
# which replaced the studies table and broke any re-run of this cell.
degree_normalized_dir = f"{path}/results/feature-only-KG/periods/degree_normalized"
os.makedirs(degree_normalized_dir, exist_ok=True)

for p in df['period'].cat.categories:
    print(f"\n--- Period: {p} ---")
    normalized_degree_df = run_df(cypher, {"period": p})
    normalized_degree_df.to_csv(f"{degree_normalized_dir}/{p}_{today}.csv", index=False)




--- Period: -2000 ---

--- Period: 2001-2005 ---

--- Period: 2006-2010 ---

--- Period: 2011-2015 ---

--- Period: 2016-2020 ---

--- Period: 2021-2025 ---


# 4. Visualization

## Top Degree & Top Strength

In [4]:
# Load the per-period degree and strength CSVs (file name: "<period>_<date>.csv")
# and stack each metric into one table with a 'period' column.
# - Files are read in sorted order, so when several dated files exist for a
#   period the newest date is the one kept (os.listdir order is arbitrary).
# - Each file is read into its own variable instead of `df`, so the studies
#   table is not overwritten.
top_degree_dir = f"{path}/results/feature-only-KG/periods/top_degree"
top_degree_all = {}
for file in sorted(os.listdir(top_degree_dir)):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        period_table = pd.read_csv(f"{top_degree_dir}/{file}")
        period_table['period'] = period
        top_degree_all[period] = period_table
all_top_degree_df = pd.concat(top_degree_all.values(), ignore_index=True)


top_strength_dir = f"{path}/results/feature-only-KG/periods/top_strength"
top_strength_all = {}
for file in sorted(os.listdir(top_strength_dir)):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        period_table = pd.read_csv(f"{top_strength_dir}/{file}")
        period_table['period'] = period
        top_strength_all[period] = period_table
all_top_strength_df = pd.concat(top_strength_all.values(), ignore_index=True)

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import re
import unicodedata
import plotly.graph_objects as go
import plotly.subplots as sp


# Layout settings shared by every figure below.
template_type = "none"   # passed as the Plotly `template`
showlegend = True        # passed as the Plotly `showlegend`

def norm(s):
    """Normalise a label so visually identical names compare equal.

    Missing values are returned unchanged. Otherwise the value is converted to
    a string, NFKC-normalised, Unicode dash variants are replaced by "-", and
    runs of whitespace are collapsed to one space with the ends trimmed.
    """
    if pd.isna(s):
        return s
    text = unicodedata.normalize("NFKC", str(s))
    # hyphen / figure dash / en dash / em dash / minus sign -> plain "-"
    text = re.sub(r"[\u2010\u2011\u2012\u2013\u2014\u2212]", "-", text)
    return re.sub(r"\s+", " ", text).strip()

# Long-format table: one row per (node, period, metric) with a shared "score" column.
deg_df = all_top_degree_df.copy()
deg_df["metric"] = "degree"

# The strength tables call their value column "strength"; align it with "score".
str_df = all_top_strength_df.copy()
str_df = str_df.rename(columns={"strength": "score"})
str_df["metric"] = "strength"

long_df = pd.concat(
    [frame[["kind","node","score","period","metric"]] for frame in (deg_df, str_df)],
    ignore_index=True,
)

long_df["node"] = long_df["node"].map(norm)

# Period order
period_order = ["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"]
long_df["period"] = pd.Categorical(long_df["period"], categories=period_order, ordered=True)

# ---- nodes that get their own colour; everything else is drawn grey ----
HIGHLIGHTS = {"Intergenerational Wealth Mobility and Inheritance", "Regression-based Measures", "Empirical Estimates and Determinants", "Panel/Longitudinal Surveys"}

# Colour group: a highlighted node is its own group, all other nodes share "Other".
long_df["color_group"] = long_df["node"].where(long_df["node"].isin(HIGHLIGHTS), "Other")

# Dark2 colours for the highlighted nodes (in sorted name order), grey for "Other".
palette = {
    name: px.colors.qualitative.Dark2[i % len(px.colors.qualitative.Dark2)]
    for i, name in enumerate(sorted(HIGHLIGHTS))
}
palette["Other"] = "#C7C7C7"

# ---------------------------
# Helpers: size scaling
# ---------------------------
def scaled_size(series, min_size=6, max_size=28, transform=None):
    """Map the values of a pandas Series linearly onto marker sizes.

    Args:
        series: Values to scale (cast to float).
        min_size: Size given to the smallest value.
        max_size: Size given to the largest value.
        transform: Optional function applied to the float array before scaling.

    Returns:
        numpy array of sizes. An empty input is returned as-is; when the range
        is not finite or is narrower than 1e-12, every element gets the
        midpoint of ``min_size`` and ``max_size``.
    """
    values = series.astype(float).to_numpy()
    if transform is not None:
        values = transform(values)
    if values.size == 0:
        return values

    lo = np.nanmin(values)
    hi = np.nanmax(values)
    degenerate = (not np.isfinite(lo)) or (not np.isfinite(hi)) or (hi - lo < 1e-12)
    if degenerate:
        midpoint = (min_size + max_size) / 2.0
        return np.full_like(values, midpoint)

    fraction = (values - lo) / (hi - lo)
    return min_size + fraction * (max_size - min_size)

# Make per-metric size columns (global scaling across periods for comparability).
# Marker size deliberately encodes the OTHER metric: degree points are sized by
# the node's strength, and strength points by its degree.
deg_mask = long_df["metric"] == "degree"
str_mask = long_df["metric"] == "strength"

# The degree rows and the strength rows are not in the same order (each source
# table has its own ordering), so the partner score is looked up by
# (period, node) instead of being copied over by row position.
_pair_index = pd.MultiIndex.from_arrays(
    [long_df["period"].astype(str), long_df["node"].astype(str)],
    names=["period", "node"],
)


def _partner_scores(own_mask, partner_mask):
    """Score of the other metric for every row selected by ``own_mask``.

    Rows whose (period, node) pair has no row under ``partner_mask`` get NaN.
    The result is positionally aligned with ``long_df[own_mask]``.
    """
    partner = pd.Series(
        long_df.loc[partner_mask, "score"].to_numpy(dtype=float),
        index=_pair_index[partner_mask.to_numpy()],
    )
    # reindex needs unique keys; keep the first row of any repeated pair.
    partner = partner[~partner.index.duplicated(keep="first")]
    return partner.reindex(_pair_index[own_mask.to_numpy()]).reset_index(drop=True)


sizes_deg = pd.Series(index=long_df.index, dtype=float)
sizes_str = pd.Series(index=long_df.index, dtype=float)

sizes_deg.loc[deg_mask] = scaled_size(_partner_scores(deg_mask, str_mask), min_size=6, max_size=28, transform=None)
sizes_str.loc[str_mask] = scaled_size(_partner_scores(str_mask, deg_mask), min_size=6, max_size=28, transform=None)

# A node missing from the other metric gets the smallest marker rather than NaN.
sizes_deg.loc[deg_mask & sizes_deg.isna()] = 6
sizes_str.loc[str_mask & sizes_str.isna()] = 6

long_df["size_deg"] = sizes_deg
long_df["size_str"] = sizes_str

# ------------------------------
# A) Beeswarm / strip: Degree over periods (size by degree)
# ------------------------------
# Degree of every node per period; marker size comes from the `size_deg` column.
df_deg = long_df.loc[deg_mask].copy()

fig_degree_strip = px.scatter(
    df_deg,
    x="period",
    y="score",
    size="size_deg",
    color="color_group",
    color_discrete_map=palette,
    category_orders={"period": period_order},
    hover_data=["node","kind","score"],
    title="Degree distribution per period (size ‚àù Strength)",  # size by strength is intentional
)
fig_degree_strip.update_layout(template=template_type, showlegend=showlegend)

# ------------------------------
# B) Beeswarm / strip: Strength over periods 
# ------------------------------
# Strength of every node per period; marker size comes from the `size_str` column.
df_str = long_df.loc[str_mask].copy()

fig_strength_strip = px.scatter(
    df_str,
    x="period",
    y="score",
    size="size_str",
    color="color_group",
    color_discrete_map=palette,
    category_orders={"period": period_order},
    hover_data=["node","kind","score"],
    title="Strength distribution per period (size ‚àù Degree)",  # size by degree is intentional
)

# Axis and legend titles first, then the shared template settings.
fig_strength_strip.update_layout(
    yaxis_title="Strength (Œ£ weights)",
    legend_title_text="Node (highlighted only)",
)
fig_strength_strip.update_layout(template=template_type, showlegend=showlegend)

# ------------------------------
# C) Strength vs Degree scatter, faceted by period (optional)
# ------------------------------
# Wide table: one row per (period, node) holding both degree and strength.
deg_p = long_df.loc[deg_mask, ["period","node","kind","score","color_group"]].rename(columns={"score":"degree"})
str_p = long_df.loc[str_mask, ["period","node","score"]].rename(columns={"score":"strength"})
sd = deg_p.merge(str_p, how="inner", on=["period","node"])

# log10(strength + 1) is the y value of the scatter plots.
sd['log_y+1'] = np.log10(sd["strength"] + 1)
sd = sd.sort_values(by=["period","degree"], ascending=[True,False])

# Quick version with Plotly's built-in OLS trendline (one per facet); simpler,
# but offers less control over colours and trendline styling.
fig_sd_simple = px.scatter(
    sd,
    x="degree",
    y="log_y+1",
    facet_col="period",
    facet_col_wrap=3,
    category_orders={"period": period_order},
    hover_data=["node","kind"],
    trendline="ols",
    trendline_color_override="DarkSlateGrey",
    title="Strength vs Degree by period",
)


# Strength-vs-degree scatter with a hand-fitted trendline, one subplot per
# period on a 2 x 3 grid (subplot titles follow `period_order`).
fig_sd = sp.make_subplots(rows=2, cols=3, subplot_titles=period_order)

for idx, period in enumerate(period_order):
    # Leave the subplot empty when the period has no rows.
    if period not in sd['period'].values:
        continue
        
    period_data = sd[sd['period'] == period]
    
    # Calculate row and col based on period_order index
    col = idx % 3 + 1
    row = idx // 3 + 1
    
    # Add scatter points (highlighted nodes coloured via `palette`, others grey)
    fig_sd.add_trace(go.Scatter(
        x=period_data['degree'], 
        y=period_data['log_y+1'],
        mode='markers',
        marker=dict(
            size=8,
            color=[palette.get(cg, "#C7C7C7") for cg in period_data['color_group']]
        ),
        hovertemplate=
            "Node: %{customdata[0]}<br>" +
            "Kind: %{customdata[1]}<br>" +
            "Degree: %{x}<br>" +
            "Strength: %{customdata[2]}<br>" +
            "<extra></extra>",
        # customdata columns: node, kind, raw (un-logged) strength
        customdata=np.stack((period_data['node'], period_data['kind'], period_data['strength']), axis=-1),
        showlegend=False
    ), row=row, col=col)  # Added row and col here
    
    # Degree-1 least-squares fit of log10(strength + 1) on degree.
    # NOTE: `p` is rebound here to the fitted polynomial.
    z = np.polyfit(period_data['degree'], period_data['log_y+1'], 1)
    p = np.poly1d(z)
    x_trend = np.linspace(period_data['degree'].min(), period_data['degree'].max(), 100)
    y_trend = p(x_trend)
    
    # Add trendline
    fig_sd.add_trace(
        go.Scatter(
            x=x_trend, y=y_trend,
            mode='lines',
            line=dict(color='DarkSlateGrey', width=2),
            showlegend=False,
            hoverinfo='skip',
        ),
        row=row, col=col
    )

fig_sd.update_layout(
    template=template_type, 
    showlegend=showlegend,
    title_text="Strength vs Degree by period",
    
)

# Same fixed x range on every subplot; x title only on the bottom row.
fig_sd.update_xaxes(range=[0, 20])
fig_sd.update_xaxes(title_text="degree", row=2)
fig_sd.update_yaxes(title_text="log_y+1")

# Repeats the template/legend settings already applied above.
fig_sd.update_layout(template=template_type, showlegend=showlegend, 
                    #  plot_bgcolor = "rgba(0,0,0,0)", 
                    #  paper_bgcolor = "rgba(0,0,0,0)"
                     )

# Render the four figures in order.
for _figure in (fig_degree_strip, fig_strength_strip, fig_sd, fig_sd_simple):
    _figure.show()

# Optional exports (uncomment to write HTML / PNG versions):
# fig_degree_strip.write_html(f"{path}/results/feature-only-KG/img/degree_over_time.html")
# fig_strength_strip.write_html(f"{path}/results/feature-only-KG/img/strength_over_time.html")
# fig_sd.write_html(f"{path}/results/feature-only-KG/img/strength_vs_degree.html")

# fig_degree_strip.write_image(f"{path}/results/feature-only-KG/img/degree_over_time_legend.png", width=800, height=600, scale=2)
# fig_strength_strip.write_image(f"{path}/results/feature-only-KG/img/strength_over_time_legend.png", width=800, height=600, scale=2)
# fig_sd.write_image(f"{path}/results/feature-only-KG/img/strength_vs_degree_legend.png", width=1200, height=800, scale=2)


## Node Betweenness & Edge Betweenness

In [89]:
# Load the per-period node and edge betweenness CSVs (file name:
# "<period>_<date>.csv") and stack each into one table with a 'period' column.
# - The dicts are named *_all: the original name `top_betweenness` replaced
#   the query function of the same name, so the temporal loop could not be
#   re-run after this cell.
# - Files are read in sorted order, so when several dated files exist for a
#   period the newest date is the one kept (os.listdir order is arbitrary).
# - Each file is read into its own variable instead of `df`.
top_betweenness_dir = f"{path}/results/feature-only-KG/periods/top_betweenness"
top_betweenness_all = {}
for file in sorted(os.listdir(top_betweenness_dir)):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        period_table = pd.read_csv(f"{top_betweenness_dir}/{file}")
        period_table['period'] = period
        top_betweenness_all[period] = period_table
all_top_betweenness_df = pd.concat(top_betweenness_all.values(), ignore_index=True)


edge_betweenness_dir = f"{path}/results/feature-only-KG/periods/edge_betweenness"
edge_betweenness_all = {}
for file in sorted(os.listdir(edge_betweenness_dir)):
    if file.endswith(".csv"):
        period = file.split("_")[0]
        period_table = pd.read_csv(f"{edge_betweenness_dir}/{file}")
        period_table['period'] = period
        edge_betweenness_all[period] = period_table
all_edge_betweenness_df = pd.concat(edge_betweenness_all.values(), ignore_index=True)

In [107]:
import plotly.express as px

# Distribution of node betweenness scores pooled over all periods.
hist = px.histogram(all_top_betweenness_df, x="score", nbins=100)
hist.show()

In [110]:
showlegend = True

# --- Colour groups ---
# Nodes: highlighted nodes keep their own name, all others share "Other".
all_top_betweenness_df["node"] = all_top_betweenness_df["node"].apply(norm)
all_top_betweenness_df["color_group"] = np.where(all_top_betweenness_df["node"].isin(HIGHLIGHTS), all_top_betweenness_df["node"], "Other")

# Edges: coloured by their `v` endpoint when `u` is one of the two hub nodes
# and `v` is in `v_highlights`; every other edge is "Other".
all_edge_betweenness_df["u"] = all_edge_betweenness_df["u"].apply(norm)
all_edge_betweenness_df["v"] = all_edge_betweenness_df["v"].apply(norm)
v_highlights = ["Intergenerational Wealth Mobility and Inheritance", "Empirical Estimates and Determinants", "No dataset", "National Survey Data", "Panel/Longitudinal Surveys"]
all_edge_betweenness_df["color_group"] = np.where(all_edge_betweenness_df["u"].isin(["Regression-based Measures", "No dataset"]) & all_edge_betweenness_df["v"].isin(v_highlights), all_edge_betweenness_df["v"], "Other")
edge_palette = {name: px.colors.qualitative.Dark2[i % len(px.colors.qualitative.Dark2)] for i, name in enumerate(sorted(set(all_edge_betweenness_df["color_group"]) - {"Other"}))}
edge_palette["Other"] = "#C7C7C7"

# --- Ranks ---
# rank 1 = highest score within the period (ties share the lowest rank).
all_top_betweenness_df["rank"] = all_top_betweenness_df.groupby("period")["score"].rank(ascending=False, method="min")
all_edge_betweenness_df["rank"] = all_edge_betweenness_df.groupby("period")["edge_betweenness_weighted"].rank(ascending=False, method="min")
# NOTE(review): shift(1) inside a period takes the rank of the PRECEDING ROW of
# the same period, not the node's rank in the preceding period. Confirm which
# of the two "previous rank" is meant. The first row of each period gets 1.
all_top_betweenness_df["prev_rank"] = all_top_betweenness_df.groupby("period")["rank"].shift(1)
all_top_betweenness_df['prev_rank'] = all_top_betweenness_df['prev_rank'].fillna(1)
all_edge_betweenness_df["prev_rank"] = all_edge_betweenness_df.groupby("period")["rank"].shift(1)
all_edge_betweenness_df['prev_rank'] = all_edge_betweenness_df['prev_rank'].fillna(1)

max_size = 30
min_size = 6

def linear_size(rank, max_rank, min_size, max_size):
    """Marker size for one rank: ``max_size`` at rank 1, ``min_size`` at ``max_rank``."""
    if max_rank == 1:
        return max_size
    return max_size - (rank - 1) * (max_size - min_size) / (max_rank - 1)


def sizes_from_prev_rank(frame):
    """Per-period marker sizes from ``prev_rank``; same formula as ``linear_size``.

    Uses a groupby transform so the result always lines up with ``frame``'s
    index. The previous ``groupby.apply(...).reset_index(...)`` form depends
    on the shape ``apply`` happens to return (it changes when there is a
    single period).
    """
    prev_rank = frame["prev_rank"]
    max_rank = frame.groupby("period")["prev_rank"].transform("max")
    # Where the largest rank is 1 the span is undefined; those rows get max_size.
    span = (max_rank - 1).where(max_rank != 1)
    sizes = max_size - (prev_rank - 1) * (max_size - min_size) / span
    return sizes.fillna(max_size)


# Compute size per period
all_top_betweenness_df["size"] = sizes_from_prev_rank(all_top_betweenness_df)
all_edge_betweenness_df["size"] = sizes_from_prev_rank(all_edge_betweenness_df)
# ------------------------------

fig_btw = px.scatter(
    all_top_betweenness_df,
    x="period", y="score",
    size = "size",  # NOTE(review): size is derived from prev_rank (see note above), not from the raw score
    color = "color_group",
    hover_data=["node","kind","score"],
    category_orders={"period": period_order},
    title="Betweenness distribution per period (size ‚àù betweenness)",
    color_discrete_map=palette
)
fig_btw.update_layout(
    template=template_type, showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
    )

fig_ebtw = px.scatter(
    all_edge_betweenness_df,
    x="period", y="edge_betweenness_weighted",
    color="color_group",
    size = "size",
    hover_data=["u","v","edge_betweenness_weighted"],
    category_orders={"period": period_order},
    title="Weighted Edge Betweenness distribution per period (size ‚àù weighted edge betweenness)",
    color_discrete_map=edge_palette
)
fig_ebtw.update_layout(
    template=template_type, showlegend=showlegend,
    # plot_bgcolor = "rgba(0,0,0,0)",
    # paper_bgcolor = "rgba(0,0,0,0)"
    )



fig_btw.show()
fig_ebtw.show()

# write_image does not create missing folders, so make sure the target exists.
img_dir = f"{path}/results/feature-only-KG/img"
os.makedirs(img_dir, exist_ok=True)

# fig_btw.write_html(f"{path}/results/feature-only-KG/img/betweenness_over_time.html")
# fig_ebtw.write_html(f"{path}/results/feature-only-KG/img/edge_betweenness_over_time.html")
fig_btw.write_image(f"{img_dir}/betweenness_over_time_legend.png", width=800, height=600, scale=2)
fig_ebtw.write_image(f"{img_dir}/edge_betweenness_over_time_legend.png", width=800, height=600, scale=2)

    





