# 0. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import date
today = date.today()
path = os.path.dirname(os.getcwd())
print(f'üìÇ Current working directory: {path}')
print(f'üíö Today is {today}')
import sys
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'scripts'))
import ss_api_call as ss

üìÇ Current working directory: /Users/serenekim/Desktop/PhD/meta-wealth_mobility
üíö Today is 2025-12-10


In [2]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv')

In [3]:
df.year.describe()

count     617.000000
mean     2015.747164
std         7.390824
min      1976.000000
25%      2012.000000
50%      2018.000000
75%      2021.000000
max      2025.000000
Name: year, dtype: float64

In [23]:
df.columns

Index(['id', 'title', 'year', 'doi', 'landing_page', 'abstract_inverted_index',
       'language', 'is_oa', 'oa_status', 'oa_link', 'abstract', 'abstract_sm',
       'authors_sm', 'domain', 'sort_gpt_1', 'sort_gpt_2', 'sort_gpt4o_1',
       'sort_gpt4o_2', 'cited_by', 'len_cited_by', 'ref_count',
       'cited_by_count', 'Q1', 'Q1_1', 'Q2', 'Q2_1', 'Q2_2', 'Q3', 'Q4', 'abs',
       'index', 'category_n1', 'measure', 'specified', 'relevant',
       'category_n2', 'measure_1', 'measure_2', 'category_3', 'file', 'id.1',
       'Question1', 'Question2', 'rq_cat', 'RQ', 'data_cat', 'data_type',
       'type', 'category_1', 'category_2', 'author_raw_names',
       'author_raw_affiliations', 'topic_display_names', 'topic_scores',
       'field_display_names', 'subfield_display_names'],
      dtype='object')

In [27]:
df[df['title'].str.contains("Where is")][['title', 'category_1','rq_cat', 'data_cat']].values

array([['Where is the land of Opportunity? The Geography of Intergenerational Mobility in the United States *',
        'Rank‚Äêbased Measures', 'Empirical Estimates and Determinants',
        'Linked Administrative Data']], dtype=object)

In [3]:
df.isna().sum()

id                           0
title                        0
year                         0
doi                         87
landing_page                 1
abstract_inverted_index     33
language                     0
is_oa                        0
oa_status                    0
oa_link                    295
abstract                    33
abstract_sm                104
authors_sm                 508
domain                       2
sort_gpt_1                  34
sort_gpt_2                 516
sort_gpt4o_1                33
sort_gpt4o_2               516
cited_by                   139
len_cited_by                 0
ref_count                    0
cited_by_count               0
Q1                           0
Q1_1                         0
Q2                           0
Q2_1                        62
Q2_2                       555
Q3                          55
Q4                         538
abs                          0
index                        0
category_n1                  0
measure 

In [4]:
# Bucket publication years into right-closed intervals, e.g. (2000, 2005] -> "2001-2005".
period_bins = [1900, 2000, 2005, 2010, 2015, 2020, 2025]
period_labels = ["-2000", "2001-2005", "2006-2010", "2011-2015", "2016-2020", "2021-2025"]
df['period'] = pd.cut(df['year'], bins=period_bins, labels=period_labels, right=True)
df['period'].value_counts().sort_index()

period
-2000         23
2001-2005     23
2006-2010     65
2011-2015    141
2016-2020    183
2021-2025    182
Name: count, dtype: int64

# 1. Feature-Only KG in Neo4j

In [6]:
import pandas as pd
from neo4j import GraphDatabase

# --- Connect ---
driver = GraphDatabase.driver("bolt://localhost:7690", auth=("neo4j", "your_password"))

# Normalize categories: give the generic 'Others' value a column-specific name
# (presumably so it stays distinguishable across Measure / DataType / RqType).
df['category_1'] = df['category_1'].replace({'Others': 'Others_Measure'})
df['category_2'] = df['category_2'].replace({'Others': 'Others_Measure'})
df['data_cat']   = df['data_cat'].replace({'Others': 'Others_DataType'})
df['rq_cat']     = df['rq_cat'].replace({'Others': 'Others_RqType'})

def safe_str(val):
    """Return ``val`` as a stripped string, or None when it is missing or blank.

    NaN/None and the literal strings "", "nan" and "none" (case-insensitive)
    all count as missing.
    """
    if pd.isna(val) or str(val).strip().lower() in {"", "nan", "none"}:
        return None
    return str(val).strip()

# --- Cypher with undirected edges ---
# One relationship per (node pair, period). `w` counts the rows merged into it and
# `papers` keeps the distinct paper ids.
# NOTE: ON MATCH increments `w`, so re-running this cell against an already
# populated database inflates the weights; clear the graph before a re-run.
cypher = """
UNWIND $rows AS row

MERGE (m1:Measure {name: row.m1})
MERGE (ds:DataType {name: row.datatype})
MERGE (rq:RqType {name: row.rqtype})

// Measure -- DataType
MERGE (m1)-[r1:CO_MEASURE_DATATYPE {period: row.period}]-(ds)
  ON CREATE SET r1.w = 1, r1.papers=[row.paper_id]
  ON MATCH  SET r1.w = r1.w + 1, r1.papers = apoc.coll.toSet(coalesce(r1.papers, []) + row.paper_id)

// DataType -- RqType
MERGE (ds)-[r2:CO_DATATYPE_RQTYPE {period: row.period}]-(rq)
  ON CREATE SET r2.w = 1, r2.papers=[row.paper_id]
  ON MATCH  SET r2.w = r2.w + 1, r2.papers = apoc.coll.toSet(coalesce(r2.papers, []) + row.paper_id)

// RqType -- Measure
MERGE (rq)-[r3:CO_RQTYPE_MEASURE {period: row.period}]-(m1)
  ON CREATE SET r3.w = 1, r3.papers=[row.paper_id]
  ON MATCH  SET r3.w = r3.w + 1, r3.papers = apoc.coll.toSet(coalesce(r3.papers, []) + row.paper_id)

"""

# try/finally guarantees the driver is released even when a query fails.
try:
    # --- Constraints ---
    with driver.session() as session:
        for constraint in (
            "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Measure)   REQUIRE m.name IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (ds:DataType) REQUIRE ds.name IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (r:RqType)    REQUIRE r.name IS UNIQUE",
        ):
            session.run(constraint).consume()

    # --- Collect rows ---
    rows = []
    for _, row in df.iterrows():
        record = {
            "m1": safe_str(row.get("category_1")),
            "datatype": safe_str(row.get("data_cat")),
            "rqtype": safe_str(row.get("rq_cat")),
            # pd.cut yields NaN for years outside the bins and NaN is truthy, so
            # normalise it to None to let the completeness check below catch it.
            "period": safe_str(row.get("period")),
            "paper_id": safe_str(row.get("id")),
        }
        if not all(record[key] for key in ("m1", "datatype", "rqtype", "period")):
            continue  # skip incomplete rows
        rows.append(record)

    with driver.session() as session:
        if rows:
            # consume() makes the write finish inside the session so that any
            # server-side error is raised here instead of being lost on close.
            session.run(cypher, rows=rows).consume()
finally:
    driver.close()


# 1-3. [Triplet-approach] Creating Decay Weight for Resurgence
* Do not use the decaying weight computed in this section for analysis: it is incorrect. The cell is still fine for obtaining the year-wise dataframe.
* The knowledge graph itself is fine.

In [7]:
import math
from neo4j import GraphDatabase

# --- Connect ---
driver = GraphDatabase.driver("bolt://localhost:7690", auth=("neo4j", "your_password"))

# Normalize categories
df['category_1'] = df['category_1'].replace({'Others': 'Others_Measure'})
df['data_cat']   = df['data_cat'].replace({'Others': 'Others_DataType'})
df['rq_cat']     = df['rq_cat'].replace({'Others': 'Others_RqType'})
# One key per (measure, data type, research-question type) combination.
df['triplet_key'] = df['category_1'] + ' | ' + df['data_cat'] + ' | ' + df['rq_cat']

def safe_str(val):
    """Return ``val`` as a stripped string, or None when it is missing or blank."""
    if pd.isna(val) or str(val).strip().lower() in {"", "nan", "none"}:
        return None
    return str(val).strip()

# ---- Time unit and half-life ----
HALF_LIFE_YEARS = 5.0
LAMBDA = math.log(2.0) / HALF_LIFE_YEARS  # decay rate such that weight halves every HALF_LIFE_YEARS

params = {"lambda": LAMBDA}

# --- Cypher ---
# Changes relative to the earlier version of this query:
#   * the USES_* relationships start from `t` (the Triplet merged above); the
#     previous variable `ty` was never bound, so the triplet was not linked;
#   * `t.ew` is assigned before `t.last_year`, so the decay term uses the
#     previously stored year rather than the year of the current row.
cypher = """
UNWIND $rows AS row

MERGE (m:Measure  {name: row.m1})
MERGE (d:DataType {name: row.datatype})
MERGE (r:RqType   {name: row.rqtype})

MERGE (t:Triplet {key: row.triplet_key})
    ON CREATE SET
        t.m = row.m1,
        t.d = row.datatype,
        t.r = row.rqtype,
        t.count = 1,
        t.first_year = row.year,
        t.last_year = row.year,
        t.years = [row.year],
        t.paper_id = [row.paper_id],
        t.ew = 3.0,
        t.ewlist = [3.0]
    ON MATCH SET
        t.count = coalesce(t.count, 0) + 1,
        t.ew = coalesce(t.ew,0.0) * exp(-$lambda * (toInteger(row.year)-toInteger(t.last_year))) + 3.0,
        t.ewlist = apoc.coll.toSet(coalesce(t.ewlist, []) + t.ew),
        t.last_year = row.year,
        t.years = CASE WHEN row.year IN t.years THEN t.years ELSE t.years + [row.year] END,
        t.paper_id = CASE WHEN row.paper_id IS NULL OR row.paper_id IN t.paper_id THEN t.paper_id ELSE t.paper_id + [row.paper_id] END

// Connect fact to the three attribute nodes for easy filtering
MERGE (t)-[:USES_MEASURE]->(m)
MERGE (t)-[:USES_DATATYPE]->(d)
MERGE (t)-[:USES_RQTYPE]->(r)
"""

# try/finally guarantees the driver is released even when a query fails.
try:
    # --- Constraints ---
    with driver.session() as session:
        for constraint in (
            "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Measure)   REQUIRE m.name IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (ds:DataType) REQUIRE ds.name IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (r:RqType)    REQUIRE r.name IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (t:Triplet) REQUIRE t.key IS UNIQUE",
        ):
            session.run(constraint).consume()

    # --- Collect rows ---
    # Feed rows in chronological order so that first_year / last_year and the
    # year gap used by the decay are well defined (UNWIND processes the list in order).
    rows = []
    for _, row in df.sort_values("year", kind="stable").iterrows():
        m1       = safe_str(row.get("category_1"))
        datatype = safe_str(row.get("data_cat"))
        rqtype   = safe_str(row.get("rq_cat"))

        if not all([m1, datatype, rqtype]):
            continue  # skip incomplete rows

        rows.append({
            "m1": m1,
            "datatype": datatype,
            "rqtype": rqtype,
            "paper_id": safe_str(row.get("id")),
            "year": row.get("year"),
            "triplet_key": safe_str(row.get("triplet_key")),
        })

    with driver.session() as session:
        if rows:
            # consume() makes the write finish inside the session so that any
            # server-side error is raised here instead of being lost on close.
            session.run(cypher, rows=rows, **params).consume()
finally:
    driver.close()


# 2. Centrality Measures

In [10]:
# pip install neo4j pandas
from neo4j import GraphDatabase
import pandas as pd

# --- CONFIG ---
# Connection settings for the local Neo4j instance and the base name used for
# the per-period GDS projections (see graph_name_for_period).
NEO4J_URI  = "bolt://localhost:7690"
NEO4J_AUTH = ("neo4j", "your_password")
GRAPH_NAME = "features"

# Relationship types that make up the feature co-occurrence graph.
REL_TYPES = [
    "CO_MEASURE_DATATYPE",
    "CO_DATATYPE_RQTYPE",
    "CO_RQTYPE_MEASURE"
]

# --- UTILS ---
# Module-level driver shared by run_df() and run(); both look it up at call time.
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

def run_df(cypher, params=None):
    """Execute ``cypher`` on the shared driver and return the records as a DataFrame.

    ``params`` is the query-parameter mapping; a falsy value means "no parameters".
    The records are materialised before the session is closed.
    """
    query_params = params if params else {}
    with driver.session() as session:
        records = session.run(cypher, query_params).data()
    return pd.DataFrame(records)

def run(cypher, params=None):
    """Execute ``cypher`` on the shared driver for its side effects only.

    ``params`` is the query-parameter mapping; a falsy value means "no parameters".
    The result is consumed explicitly so the statement has finished, and any
    server-side error has been raised, before the session is closed.
    """
    with driver.session() as s:
        s.run(cypher, params or {}).consume()
        
def graph_name_for_period(period: str) -> str:
    """Return the GDS graph name for ``period``: ``GRAPH_NAME``, two underscores, the period.

    The period is inserted verbatim; no sanitising of spaces or other
    characters is performed.
    """
    return "{}__{}".format(GRAPH_NAME, period)

# --- 1) GDS PROJECTION (UNDIRECTED, WEIGHTED) ---
def period_graph_projection(period: str):
    """Project the co-occurrence edges of one period into a named GDS graph.

    The graph name comes from ``graph_name_for_period(period)``. Any existing
    projection with that name is dropped first. The node query is not filtered
    by period, so every Measure/DataType/RqType node is projected; only the
    relationships are restricted to ``r.period = period``. Each relationship
    carries ``w`` (weight) and ``inv_w`` (1 / w, or 1e12 when w is missing or
    not positive).

    Prints the one-row projection summary (graphName, nodeCount,
    relationshipCount) and returns None.
    """
    name = graph_name_for_period(period)
    # Drop if exists; the second argument tells GDS not to fail when the graph is missing.
    run("CALL gds.graph.drop($name, false) YIELD graphName", {"name": name})

    cypher = """
    CALL gds.graph.project.cypher(
      $name,
      // Nodes
      'MATCH (n:Measure|DataType|RqType)
       RETURN id(n) AS id, labels(n) AS labels',
      // Relationships (all three types), undirected, with w and inv_w as top-level props
      'MATCH (a)-[r:CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE]-(b)
       WHERE r.period = $period
       RETURN id(a) AS source,
              id(b) AS target,
              type(r) AS type,
              coalesce(r.w, 1.0) AS w,
              CASE WHEN coalesce(r.w,0) > 0 THEN 1.0 / r.w ELSE 1e12 END AS inv_w,
              "UNDIRECTED" AS orientation',
      { parameters: {period: $period} }
    )
    YIELD graphName, nodeCount, relationshipCount
    """
    # Named distinctly from the notebook-level `df` (the study table) to avoid confusion.
    projection_summary = run_df(cypher, {"name": name, "period": period})
    print(projection_summary)


# --- 2) METRICS (STREAM) ---
def top_degree(name: str):
    """Stream unweighted degree centrality for the projected graph ``name``.

    Returns a DataFrame with columns kind (first node label), node (name) and
    score, sorted by score in descending order.
    """
    return run_df("""
    CALL gds.degree.stream($name)
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """, {"name": name})

def top_strength(name: str):
    """Stream weighted degree (strength) for the projected graph ``name``.

    Uses the relationship property ``w`` as weight. Returns a DataFrame with
    columns kind, node and strength, sorted by strength in descending order.
    """
    return run_df("""
    CALL gds.degree.stream($name, {relationshipWeightProperty:'w'})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score AS strength
    ORDER BY strength DESC
    """, {"name": name})

def top_betweenness(name: str, weight_prop: str | None = None):
    """Stream node betweenness centrality for the projected graph ``name``.

    When ``weight_prop`` is given it is passed to GDS as
    ``relationshipWeightProperty``; otherwise an empty configuration map is
    used. Returns a DataFrame with columns kind, node and score, sorted by
    score in descending order.
    """
    if weight_prop is None:
        config_map = "{}"
    else:
        # The property name is spliced into the query text, not sent as a parameter.
        config_map = "{{relationshipWeightProperty:'{}'}}".format(weight_prop)
    query = f"""
    CALL gds.betweenness.stream($name, {config_map})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """
    return run_df(query, {"name": name})

def hops_count(name: str):
    """Stream all-pairs shortest-path distances for the projected graph ``name``.

    No weight property is configured in the call. Returns a DataFrame with
    columns source, target (node names) and hops, sorted by hops in descending
    order.
    """
    return run_df("""
    CALL gds.allShortestPaths.stream($name)
    YIELD sourceNodeId, targetNodeId, distance
    RETURN gds.util.asNode(sourceNodeId).name AS source,
        gds.util.asNode(targetNodeId).name AS target,
        distance AS hops
    ORDER BY hops DESC
    """, {"name": name})

In [11]:
import networkx as nx

def edge_betweenness_per_period(period: str) -> pd.DataFrame:
    """Compute unweighted and weighted edge betweenness for one period with NetworkX.

    NOTE(review): this definition is shadowed by a later redefinition of the
    same name further down the notebook; after a full run only the later one
    is in effect.

    Caveats of this version:
      * every Measure/DataType/RqType node is added to the graph, including
        nodes with no edge in ``period``; NetworkX normalises by the total node
        count, so those extra nodes presumably lower the normalised scores;
      * the undirected MATCH pattern appears to return each relationship once
        per direction, in which case the ``has_edge`` branch fires on the second
        sighting and ``weight`` ends up doubled for every edge.

    Returns a DataFrame with columns u, v, edge_betweenness and
    edge_betweenness_weighted.
    """
    # Pull edges for the given period
    edges_df = run_df(f"""
    MATCH (a)-[r:{'|'.join(REL_TYPES)}]-(b)
    WHERE r.period = $period
    RETURN a.name AS u, labels(a)[0] AS kind_u,
           b.name AS v, labels(b)[0] AS kind_v,
           coalesce(r.w,1) AS w, id(r) AS rel_id, type(r) AS rel_type
    """, { "period": period })

    # Pull nodes (all types)
    # This query has no period filter, so nodes without edges in this period are included.
    nodes_df = run_df("""
    MATCH (n:Measure) RETURN n.name AS name, 'Measure' AS kind
    UNION ALL
    MATCH (n:DataType) RETURN n.name AS name, 'DataType' AS kind
    UNION ALL
    MATCH (n:RqType) RETURN n.name AS name, 'RqType' AS kind
    """)

    # Build graph
    # Nodes are keyed by name only; rows for an already-present pair are folded into one edge.
    G = nx.Graph()
    for _, row in nodes_df.iterrows():
        G.add_node(row["name"], kind=row["kind"])
    for _, row in edges_df.iterrows():
        u, v = row["u"], row["v"]
        w    = float(row["w"])
        if G.has_edge(u, v):
            G[u][v]["weight"] += w
            G[u][v]["rel_ids"].append(row["rel_id"])
            G[u][v]["rel_types"].add(row["rel_type"])
        else:
            G.add_edge(u, v,
                       weight=w,
                       rel_ids=[row["rel_id"]],
                       rel_types={row["rel_type"]})

    # Unweighted betweenness
    edge_betweenness = nx.edge_betweenness_centrality(G, normalized=True)
    edge_betweenness_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness.items()],
        columns=["u", "v", "edge_betweenness"]
    )

    # Weighted betweenness (inverse weight as length)
    # A heavier co-occurrence edge becomes a shorter path segment.
    for u, v, data in G.edges(data=True):
        data["length"] = 1.0 / data["weight"]
    edge_betweenness_w = nx.edge_betweenness_centrality(G, normalized=True, weight="length")
    edge_betweenness_w_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness_w.items()],
        columns=["u", "v", "edge_betweenness_weighted"]
    )

    # Merge and return
    return edge_betweenness_df.merge(edge_betweenness_w_df, on=["u", "v"], how="outer")


In [12]:
# Export edge betweenness for each of the six categorical periods.
# The output directory does not depend on the period, so create it once.
edge_betweenness_dir = f"{path}/results/feature-only-KG/periods/edge_betweenness"
os.makedirs(edge_betweenness_dir, exist_ok=True)

for p in df['period'].cat.categories:
    period_df = edge_betweenness_per_period(p)
    # index=False matches the later export cell, which writes the same file names
    # without the meaningless positional index column.
    period_df.to_csv(f"{edge_betweenness_dir}/{p}_{today}.csv", index=False)




In [14]:
# Updated: The previous version counts also degree=0 nodes
import networkx as nx
import pandas as pd
import os

def edge_betweenness_per_period(period: str) -> pd.DataFrame:
    """Compute unweighted and weighted edge betweenness for one period with NetworkX.

    Only nodes that take part in at least one relationship with
    ``r.period = period`` are added to the graph. Nodes are keyed by name, and
    relationships between the same pair of names are folded into one edge whose
    ``weight`` is the sum of their ``w`` values.

    Parameters
    ----------
    period : str
        Period label stored on the relationships (e.g. "2001-2005").

    Returns
    -------
    pd.DataFrame
        Columns u, v, edge_betweenness (unweighted) and
        edge_betweenness_weighted (path length = 1 / weight). Empty, with the
        same columns, when the period has no relationships.
    """
    result_columns = ["u", "v", "edge_betweenness", "edge_betweenness_weighted"]

    # Pull edges for the given period (only those with at least one relationship)
    edges_df = run_df(f"""
    MATCH (a)-[r:{'|'.join(REL_TYPES)}]-(b)
    WHERE r.period = $period
    RETURN a.name AS u, labels(a)[0] AS kind_u,
           b.name AS v, labels(b)[0] AS kind_v,
           coalesce(r.w,1) AS w, id(r) AS rel_id, type(r) AS rel_type
    """, {"period": period})

    # run_df returns a frame without columns when nothing matches; indexing it
    # by column name would raise KeyError.
    if edges_df.empty:
        return pd.DataFrame(columns=result_columns)

    # Collect only nodes that actually appear in these edges. The node kinds are
    # already part of the edge rows, so no further queries are needed.
    active_nodes = pd.unique(edges_df[['u', 'v']].values.ravel('K'))
    kinds_lookup = {}
    for name_col, kind_col in (("u", "kind_u"), ("v", "kind_v")):
        for node_name, node_kind in zip(edges_df[name_col], edges_df[kind_col]):
            kinds_lookup.setdefault(node_name, node_kind)

    # The undirected MATCH returns every relationship once per direction. Keep one
    # row per relationship id so weights and rel_ids are not counted twice.
    unique_edges = edges_df.drop_duplicates(subset="rel_id")

    # --- Build the graph only with active nodes ---
    G = nx.Graph()
    for node in active_nodes:
        G.add_node(node, kind=kinds_lookup.get(node, "Unknown"))

    for edge in unique_edges.itertuples(index=False):
        u, v = edge.u, edge.v
        w = float(edge.w)
        if G.has_edge(u, v):
            # Distinct relationships between the same two names are merged.
            G[u][v]["weight"] += w
            G[u][v]["rel_ids"].append(edge.rel_id)
            G[u][v]["rel_types"].add(edge.rel_type)
        else:
            G.add_edge(u, v,
                       weight=w,
                       rel_ids=[edge.rel_id],
                       rel_types={edge.rel_type})

    # --- Unweighted edge betweenness ---
    edge_betweenness = nx.edge_betweenness_centrality(G, normalized=True)
    edge_betweenness_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness.items()],
        columns=["u", "v", "edge_betweenness"]
    )

    # --- Weighted betweenness (inverse weight as distance) ---
    # A heavier co-occurrence edge becomes a shorter path segment.
    for u, v, data in G.edges(data=True):
        data["length"] = 1.0 / data["weight"]
    edge_betweenness_w = nx.edge_betweenness_centrality(
        G, normalized=True, weight="length"
    )
    edge_betweenness_w_df = pd.DataFrame(
        [(u, v, score) for (u, v), score in edge_betweenness_w.items()],
        columns=["u", "v", "edge_betweenness_weighted"]
    )

    # Merge and return
    return edge_betweenness_df.merge(edge_betweenness_w_df, on=["u", "v"], how="outer")


# --- Save results per period ---
# One CSV per period, named "<period>_<today>.csv", written without the index column.
out_dir = f"{path}/results/feature-only-KG/periods/edge_betweenness"
for p in df["period"].cat.categories:
    period_df = edge_betweenness_per_period(p)
    os.makedirs(out_dir, exist_ok=True)
    period_df.to_csv(f"{out_dir}/{p}_{today}.csv", index=False)




# 3. Temporal

In [7]:
df['period'].cat.categories

Index(['-2000', '2001-2005', '2006-2010', '2011-2015', '2016-2020',
       '2021-2025'],
      dtype='object')

## Checking the connectivity

In [12]:
import networkx as nx
from neo4j import GraphDatabase

# adjust to your settings
# NOTE: this rebinds the notebook-level `driver`, which run_df() and run() also
# look up at call time, so they use this connection from here on.
uri = "bolt://localhost:7690"
auth = ("neo4j", "your_password")
driver = GraphDatabase.driver(uri, auth=auth)

def load_tripartite_graph(period=None):
    """
    Read the Measure / DataType / RqType co-occurrence network from Neo4j into
    an undirected NetworkX graph.

    When `period` is not None, only relationships whose r.period equals it are
    read; otherwise relationships of all periods are used. Nodes are
    (label, name) tuples so equal names under different labels stay distinct.
    Only nodes that appear in at least one returned relationship are present.
    """
    template = """
    MATCH (a:Measure|DataType|RqType)
          -[r:CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE]-
          (b:Measure|DataType|RqType)
    {where_clause}
    RETURN labels(a)[0] AS t1, a.name AS n1,
           labels(b)[0] AS t2, b.name AS n2
    """
    filter_by_period = period is not None
    query = template.format(
        where_clause="WHERE r.period = $period" if filter_by_period else ""
    )
    query_params = {"period": period} if filter_by_period else {}

    graph = nx.Graph()
    with driver.session() as session:
        for rec in session.run(query, **query_params):
            source = (rec["t1"], rec["n1"])
            target = (rec["t2"], rec["n2"])
            graph.add_edge(source, target)

    return graph


In [13]:
def connectivity_report(G, label="graph"):
    """
    Print node/edge counts and the number of connected components of `G`.

    For a graph with more than one component, also print the component sizes,
    the share of nodes in the largest component, and every component with at
    most 5 nodes. Returns None.
    """
    total_nodes = G.number_of_nodes()
    total_edges = G.number_of_edges()
    comps = list(nx.connected_components(G))
    comp_count = len(comps)

    print(f"=== Connectivity report for {label} ===")
    print(f"Nodes: {total_nodes}, Edges: {total_edges}")
    print(f"Connected components: {comp_count}")

    if comp_count == 0:
        print("Graph is empty.")
        return

    if comp_count == 1:
        print("‚úÖ Fully connected: no isolated subgraphs.")
        return

    print("‚ùå Not fully connected: there are isolated subgraphs.")
    ordered_sizes = sorted(map(len, comps), reverse=True)
    print("Component sizes (descending):", ordered_sizes)
    largest = max(comps, key=len)
    print(f"Largest component size: {len(largest)} "
          f"({len(largest)/total_nodes:.1%} of all nodes)")

    # Optionally list the small components
    print("\nSmall components (size ‚â§ 5):")
    for small in (c for c in comps if len(c) <= 5):
        print(f"  size {len(small)}:", small)


In [14]:
# Whole Graph Connectivity Report
G_all = load_tripartite_graph(period=None)
connectivity_report(G_all, label="all periods combined")

=== Connectivity report for all periods combined ===
Nodes: 30, Edges: 168
Connected components: 1
‚úÖ Fully connected: no isolated subgraphs.


In [15]:
# Per-Period Connectivity Reports
def get_periods():
    """Return the distinct r.period values found on the co-occurrence relationships, ordered by the query."""
    period_query = """
            MATCH ()-[r:CO_MEASURE_DATATYPE|CO_DATATYPE_RQTYPE|CO_RQTYPE_MEASURE]-()
            RETURN DISTINCT r.period AS period
            ORDER BY period
        """
    found = []
    with driver.session() as session:
        # Records are read while the session is still open.
        for rec in session.run(period_query):
            found.append(rec["period"])
    return found

periods = get_periods()
for p in periods:
    G_p = load_tripartite_graph(period=p)
    connectivity_report(G_p, label=f"period {p}")

=== Connectivity report for period -2000 ===
Nodes: 16, Edges: 32
Connected components: 1
‚úÖ Fully connected: no isolated subgraphs.
=== Connectivity report for period 2001-2005 ===
Nodes: 18, Edges: 41
Connected components: 1
‚úÖ Fully connected: no isolated subgraphs.
=== Connectivity report for period 2006-2010 ===
Nodes: 23, Edges: 64
Connected components: 1
‚úÖ Fully connected: no isolated subgraphs.
=== Connectivity report for period 2011-2015 ===
Nodes: 27, Edges: 103
Connected components: 1
‚úÖ Fully connected: no isolated subgraphs.
=== Connectivity report for period 2016-2020 ===
Nodes: 27, Edges: 120
Connected components: 1
‚úÖ Fully connected: no isolated subgraphs.
=== Connectivity report for period 2021-2025 ===
Nodes: 28, Edges: 123
Connected components: 1
‚úÖ Fully connected: no isolated subgraphs.


## Centrality Measures

In [27]:
# Root folder for all per-period centrality exports.
periods_root = f"{path}/results/feature-only-KG/periods"

for p in df['period'].cat.categories:
    print(f"\n--- Period: {p} ---")
    # (Re)build the GDS projection for this period, then stream each metric from it.
    period_graph_projection(period=p)
    gname = graph_name_for_period(p)

    top_degree_df = top_degree(name=gname)
    top_strength_df = top_strength(name=gname)
    top_betweenness_df = top_betweenness(name=gname, weight_prop='inv_w')
    hops_count_df = hops_count(name=gname)
    top_betweenness_noweight_df = top_betweenness(name=gname, weight_prop=None)

    # Sub-folder name -> table written into it.
    tables_by_folder = {
        "top_betweenness": top_betweenness_df,
        "top_degree": top_degree_df,
        "top_strength": top_strength_df,
        "hops_count": hops_count_df,
        "top_betweenness_noweight": top_betweenness_noweight_df,
    }

    os.makedirs(periods_root, exist_ok=True)
    for folder in tables_by_folder:
        os.makedirs(f"{periods_root}/{folder}", exist_ok=True)

    for folder, table in tables_by_folder.items():
        table.to_csv(f"{periods_root}/{folder}/{p}_{today}.csv")




--- Period: -2000 ---
         graphName  nodeCount  relationshipCount
0  features__-2000         30                 64

--- Period: 2001-2005 ---




             graphName  nodeCount  relationshipCount
0  features__2001-2005         30                 82

--- Period: 2006-2010 ---
             graphName  nodeCount  relationshipCount
0  features__2006-2010         30                128





--- Period: 2011-2015 ---
             graphName  nodeCount  relationshipCount
0  features__2011-2015         30                206

--- Period: 2016-2020 ---
             graphName  nodeCount  relationshipCount
0  features__2016-2020         30                240





--- Period: 2021-2025 ---
             graphName  nodeCount  relationshipCount
0  features__2021-2025         30                246


## Normalized degree: degree / (2 × triplets)

In [126]:
# ‚ö†Ô∏è MAKE SURE that df is the one under 0.Libraries
# Run 2.Centrality Measures

# For each node kind: count the distinct closed Measure-DataType-RqType triangles
# the node takes part in within the period (`triplets`), count its distinct
# neighbours over relationships of that period (`degree`), and report
# degree / (2 * triplets). Nodes that are in no triangle do not appear.
cypher = """
// For Measures
MATCH (m:Measure)-[:CO_MEASURE_DATATYPE {period:$period}]-(d:DataType)
      -[:CO_DATATYPE_RQTYPE {period:$period}]-(r:RqType)
      -[:CO_RQTYPE_MEASURE {period:$period}]-(m)
WITH m, count(DISTINCT [m,d,r]) AS triplets
MATCH (m)-[rel {period:$period}]-(n)
WITH m.name AS node, 'Measure' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets,
       toFloat(degree) / (2.0 * triplets) AS normalized_degree

UNION ALL
// For DataTypes
MATCH (d:DataType)-[:CO_DATATYPE_RQTYPE {period:$period}]-(r:RqType)
      -[:CO_RQTYPE_MEASURE {period:$period}]-(m:Measure)
      -[:CO_MEASURE_DATATYPE {period:$period}]-(d)
WITH d, count(DISTINCT [m,d,r]) AS triplets
MATCH (d)-[rel {period:$period}]-(n)
WITH d.name AS node, 'DataType' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets,
       toFloat(degree) / (2.0 * triplets) AS normalized_degree

UNION ALL
// For RqTypes
MATCH (r:RqType)-[:CO_RQTYPE_MEASURE {period:$period}]-(m:Measure)
      -[:CO_MEASURE_DATATYPE {period:$period}]-(d:DataType)
      -[:CO_DATATYPE_RQTYPE {period:$period}]-(r)
WITH r, count(DISTINCT [m,d,r]) AS triplets
MATCH (r)-[rel {period:$period}]-(n)
WITH r.name AS node, 'RqType' AS kind,
     size(collect(DISTINCT n)) AS degree, triplets
RETURN node, kind, degree, triplets,
       toFloat(degree) / (2.0 * triplets) AS normalized_degree
"""

# The output directory does not depend on the period, so create it once.
degree_normalized_dir = f"{path}/results/feature-only-KG/periods/degree_normalized"
os.makedirs(degree_normalized_dir, exist_ok=True)

for p in df['period'].cat.categories:
    print(f"\n--- Period: {p} ---")
    # Keep the query result under its own name: binding it to `df` would replace
    # the study table and break every later cell that reads df['period'].
    write_table = run_df(cypher, {"period": p})
    write_table.to_csv(f"{degree_normalized_dir}/{p}_{today}.csv", index=False)




--- Period: -2000 ---

--- Period: 2001-2005 ---

--- Period: 2006-2010 ---

--- Period: 2011-2015 ---

--- Period: 2016-2020 ---

--- Period: 2021-2025 ---
