# 0. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import date
today = date.today()
# NOTE: despite the label printed below, `path` is the PARENT of the current
# working directory; the data/ and results/ paths used later hang off it.
path = os.path.dirname(os.getcwd())
print(f'📂 Current working directory: {path}')
print(f'💚 Today is {today}')
import sys
# Make the sibling `scripts` folder importable (presumably where ss_api_call
# lives). Guarded so that re-running the cell does not pile up duplicates.
scripts_dir = os.path.join(path, 'scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
import ss_api_call as ss

📂 Current working directory: /Users/serenekim/Desktop/PhD/meta-wealth_mobility
💚 Today is 2025-08-20


# 1. Feature-Only KG in Neo4j

In [2]:
# Load the cleaned study table that the graph-building cells below read from.
studies_csv = f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv'
df = pd.read_csv(studies_csv)

In [4]:
# Number of missing values in each column of the study table.
missing_per_column = df.isna().sum()
missing_per_column

id                           0
title                        0
year                         0
doi                         87
landing_page                 1
abstract_inverted_index     33
language                     0
is_oa                        0
oa_status                    0
oa_link                    295
abstract                    33
abstract_sm                104
authors_sm                 508
domain                       2
sort_gpt_1                  34
sort_gpt_2                 516
sort_gpt4o_1                33
sort_gpt4o_2               516
cited_by                   139
len_cited_by                 0
ref_count                    0
cited_by_count               0
Q1                           0
Q1_1                         0
Q2                           0
Q2_1                        62
Q2_2                       555
Q3                          55
Q4                         538
abs                          0
index                        0
category_n1                  0
measure 

In [7]:
import pandas as pd
from neo4j import GraphDatabase

# --- Connect ---
# User and password are taken from the environment when set, so the real
# secret never has to be saved in the notebook; the previous literals remain
# as fallbacks, which keeps existing setups working unchanged.
driver = GraphDatabase.driver(
    "bolt://localhost:7690",
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "your_password"),
    ),
)

# Normalize categories
# The generic 'Others' label occurs in several feature columns; give it a
# column-specific name so the value stays distinguishable per feature type.
others_relabel = {
    'category_1': 'Others_Measure',
    'category_2': 'Others_Measure',
    'data_cat': 'Others_DataType',
    'rq_cat': 'Others_RqType',
}
for column, relabeled in others_relabel.items():
    df[column] = df[column].replace({'Others': relabeled})

def safe_str(val):
    """Return ``val`` as a stripped string, or ``None`` when it is empty.

    A value counts as empty when pandas reports it as missing, or when its
    stripped, lower-cased text is ``""``, ``"nan"`` or ``"none"``.
    """
    if pd.isna(val):
        return None
    text = str(val).strip()
    if text.lower() in ("", "nan", "none"):
        return None
    return text

# --- Constraints ---
# One uniqueness constraint on `name` per node label used by the graph.
constraint_statements = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Measure)   REQUIRE m.name IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (ds:DataType) REQUIRE ds.name IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (r:RqType)    REQUIRE r.name IS UNIQUE",
)
with driver.session() as session:
    for statement in constraint_statements:
        session.run(statement)

# --- Collect rows ---
# Build one {m1, datatype, rqtype} record per study, keeping only studies for
# which all three features are present after cleaning with safe_str.
# NOTE: the second measure (category_2 / m2) is currently not collected.
rows = []
for _, record in df.iterrows():
    features = {
        "m1": safe_str(record.get("category_1")),
        "datatype": safe_str(record.get("data_cat")),
        "rqtype": safe_str(record.get("rq_cat")),
    }
    # safe_str yields None for missing/blank values, so all() drops incomplete rows
    if all(features.values()):
        rows.append(features)

# --- Cypher with undirected edges ---
# For every input row, upsert the three feature nodes (Measure, DataType,
# RqType) and the three pairwise co-occurrence relationships between them.
# The relationship patterns carry no arrow, so MERGE reuses an existing
# relationship in either direction. `w` is the co-occurrence count: set to 1
# when the relationship is created and incremented on every later match.
# NOTE(review): re-running the load against an already populated database
# increments `w` again -- clear the graph first if counts must reflect one load.
cypher = """
UNWIND $rows AS row

MERGE (m1:Measure {name: row.m1})
MERGE (ds:DataType {name: row.datatype})
MERGE (rq:RqType {name: row.rqtype})

// Measure -- DataType
MERGE (m1)-[r1:CO_MEASURE_DATATYPE]-(ds)
  ON CREATE SET r1.w = 1
  ON MATCH  SET r1.w = r1.w + 1

// DataType -- RqType
MERGE (ds)-[r2:CO_DATATYPE_RQTYPE]-(rq)
  ON CREATE SET r2.w = 1
  ON MATCH  SET r2.w = r2.w + 1

// RqType -- Measure
MERGE (rq)-[r3:CO_RQTYPE_MEASURE]-(m1)
  ON CREATE SET r3.w = 1
  ON MATCH  SET r3.w = r3.w + 1

"""

# Send all collected rows in a single query. The result is consumed explicitly
# so that a failure reported while the statement executes is raised here
# rather than left to session cleanup, and the driver is closed even when the
# write fails.
try:
    if rows:
        with driver.session() as session:
            session.run(cypher, rows=rows).consume()
finally:
    driver.close()


In [None]:
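# // Optional second measure -- inactive Cypher fragment; it would go inside the
# // UNWIND query of the load cell if the commented-out `m2` field is re-enabled.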
# FOREACH (_ IN CASE WHEN row.m2 IS NOT NULL THEN [1] ELSE [] END |
#   MERGE (m2:Measure {name: row.m2})

#   MERGE (m2)-[r4:CO_MEASURE_DATATYPE]-(ds)
#     ON CREATE SET r4.w = 1
#     ON MATCH  SET r4.w = r4.w + 1

#   MERGE (rq)-[r5:CO_RQTYPE_MEASURE]-(m2)
#     ON CREATE SET r5.w = 1
#     ON MATCH  SET r5.w = r5.w + 1

#   MERGE (m1)-[r6:CO_MEASURE_MEASURE]-(m2)
#     ON CREATE SET r6.w = 1
#     ON MATCH  SET r6.w = r6.w + 1
# )

In [23]:
# pip install neo4j pandas
from neo4j import GraphDatabase
import pandas as pd

# --- CONFIG ---
NEO4J_URI  = "bolt://localhost:7690"
# User and password are read from the environment when set, so the real secret
# does not have to be stored in the notebook; the previous literals are kept
# as fallbacks for backward compatibility.
NEO4J_AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "your_password"),
)
# Name of the in-memory GDS projection used by every metric function below.
GRAPH_NAME = "features"

# Relationship types included in the projection.
REL_TYPES = [
    "CO_MEASURE_DATATYPE",
    "CO_DATATYPE_RQTYPE",
    "CO_RQTYPE_MEASURE"
]

# --- UTILS ---
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

def run_df(cypher, params=None):
    """Execute ``cypher`` and return its records as a pandas DataFrame.

    ``params`` is the optional query-parameter dict; an empty dict is sent
    when it is omitted. Records are materialised before the session closes.
    """
    with driver.session() as s:
        records = s.run(cypher, params or {}).data()
    return pd.DataFrame(records)

def run(cypher, params=None):
    """Execute a Cypher statement whose result rows are not needed.

    ``params`` is the optional query-parameter dict (an empty dict is sent
    when omitted). Returns ``None``.
    """
    with driver.session() as s:
        # Drain the result explicitly so a failure reported while the statement
        # executes is raised to the caller instead of being left to session
        # cleanup.
        s.run(cypher, params or {}).consume()

# --- 1) GDS PROJECTION (UNDIRECTED, WEIGHTED) ---
def ensure_graph_projection():
    """Recreate the GDS in-memory projection named ``GRAPH_NAME``.

    Any existing projection of that name is dropped first. The new projection
    covers the Measure, DataType and RqType labels and every relationship type
    in ``REL_TYPES``, each projected as UNDIRECTED with the ``w`` property.
    The summary row returned by the projection call is printed.
    """
    # Drop if exists (the second argument `false` is passed so a missing graph is not an error)
    run("CALL gds.graph.drop($name, false) YIELD graphName", {"name": GRAPH_NAME})

    rel_projection = ",".join(
        f"{rel_type}: {{orientation: 'UNDIRECTED', properties: 'w'}}"
        for rel_type in REL_TYPES
    )
    project_query = f"""
    CALL gds.graph.project(
      $name,
      ['Measure','DataType','RqType'],
      {{{rel_projection}}}
    )
    YIELD graphName, nodeCount, relationshipCount;
    """
    summary = run_df(project_query, {"name": GRAPH_NAME})
    print(summary)

# --- 2) METRICS (STREAM) ---
def top_degree():
    """Return the degree score of every projected node, highest first.

    Each row holds the node's first label (``kind``), its ``name`` (``node``)
    and the ``score``. No weight property is passed, so relationships are
    counted rather than weighted.
    """
    degree_query = """
    CALL gds.degree.stream($name)
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """
    query_params = {"name": GRAPH_NAME}
    return run_df(degree_query, query_params)

def top_strength():
    """Return the weighted degree ("strength") of every node, highest first.

    Same call as the plain degree query but with ``w`` supplied as the
    relationship weight; the score column is returned as ``strength``.
    """
    strength_query = """
    CALL gds.degree.stream($name, {relationshipWeightProperty:'w'})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score AS strength
    ORDER BY strength DESC
    """
    query_params = {"name": GRAPH_NAME}
    return run_df(strength_query, query_params)

def top_betweenness():
    """Return the betweenness score of every node, highest first.

    ``w`` is passed as the relationship weight. Each row holds the node's
    first label (``kind``), its ``name`` (``node``) and the ``score``.
    """
    # NOTE(review): weighted betweenness appears to treat the weight as a path
    # cost, so frequent co-occurrence (large `w`) would make a link "longer" --
    # confirm against the GDS docs that this is the intended reading.
    betweenness_query = """
    CALL gds.betweenness.stream($name, {relationshipWeightProperty:'w'})
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId) AS n, score
    RETURN labels(n)[0] AS kind, n.name AS node, score
    ORDER BY score DESC
    """
    query_params = {"name": GRAPH_NAME}
    return run_df(betweenness_query, query_params)

def hops_count():
    """Return the shortest-path distance for node pairs, longest first.

    Rows hold the ``source`` and ``target`` node names and the ``distance``
    reported by the procedure, exposed as ``hops``. No weight property is
    passed to the call.
    """
    all_pairs_query = """
    CALL gds.allShortestPaths.stream($name)
    YIELD sourceNodeId, targetNodeId, distance
    RETURN gds.util.asNode(sourceNodeId).name AS source,
        gds.util.asNode(targetNodeId).name AS target,
        distance AS hops
    ORDER BY hops DESC
    """
    query_params = {"name": GRAPH_NAME}
    return run_df(all_pairs_query, query_params)

def louvain():
    """Return one row per Louvain community, largest community first.

    The algorithm is run with ``w`` as the relationship weight. Each row holds
    the ``communityId``, up to ten member names (``sample_members``) and the
    community ``size``; ties in size are ordered by ascending ``communityId``.
    """
    community_query = """
    CALL gds.louvain.stream($name, {relationshipWeightProperty:'w'})
    YIELD nodeId, communityId
    WITH communityId, gds.util.asNode(nodeId) AS n
    RETURN communityId,
           collect(n.name)[0..10] AS sample_members,
           count(*) AS size
    ORDER BY size DESC, communityId ASC
    """
    query_params = {"name": GRAPH_NAME}
    return run_df(community_query, query_params)

# --- 3) (OPTIONAL) WRITE SCORES BACK TO GRAPH ---
def write_strength_and_community():
    """Write weighted degree and Louvain community back to the graph.

    Runs the ``write`` mode of both algorithms with ``w`` as the relationship
    weight; results go to the ``strength`` and ``community`` properties.
    """
    strength_query = """
    CALL gds.degree.write($name, {
      relationshipWeightProperty: 'w',
      writeProperty: 'strength'
    })
    """
    community_query = """
    CALL gds.louvain.write($name, {
      relationshipWeightProperty: 'w',
      writeProperty: 'community'
    })
    """
    # Strength first, then community -- same order as before.
    for write_query in (strength_query, community_query):
        run(write_query, {"name": GRAPH_NAME})



In [24]:
if __name__ == "__main__":
    # Rebuild the projection, then compute every metric table. The driver is
    # closed in `finally` so the connection is released even if a GDS call
    # fails part-way through.
    try:
        ensure_graph_projection()
        top_degree_df = top_degree()
        top_strength_df = top_strength()
        top_betweenness_df = top_betweenness()
        # Misspelled original name, kept as an alias because the CSV-export
        # cell still refers to it.
        top_beetweenness_df = top_betweenness_df
        hops_count_df = hops_count()
        louvain_df = louvain()

        # Optionally write back
        # write_strength_and_community()

        # Triplets to DataFrame / CSV
        # trips = get_triplets(limit=50)
        # print("\nSample triplets:")
        # print(trips.head(10))
        # export_triplets_csv("triplets.csv")
    finally:
        driver.close()

  graphName  nodeCount  relationshipCount
0  features         30                336


In [27]:
# Save every metric table as <name>.csv under results/feature-only-KG.
results_dir = f"{path}/results/feature-only-KG"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(results_dir, exist_ok=True)

metric_tables = {
    "top_betweenness": top_beetweenness_df,
    "top_degree": top_degree_df,
    "top_strength": top_strength_df,
    "hops_count": hops_count_df,
    "louvain": louvain_df,
}
for file_stem, table in metric_tables.items():
    table.to_csv(f"{results_dir}/{file_stem}.csv")

In [26]:
# Display the shortest-path table returned by hops_count() (ordered by hops, descending).
hops_count_df

Unnamed: 0,source,target,hops
0,Big Data,Multigenerational Measures,3.0
1,Theoretical and Structural Models,Multigenerational Measures,3.0
2,Others_RqType,Multigenerational Measures,3.0
3,Perceptions of Mobility and Attitudes,Non‐parametric Approaches,3.0
4,Multigenerational Measures,Big Data,3.0
...,...,...,...
865,Others_RqType,Linked Administrative Data,1.0
866,Absolute Mobility Measures,No dataset,1.0
867,Regression‐based Measures,No dataset,1.0
868,Transition Matrix / Probability Measures,No dataset,1.0
