In [None]:
import os
from py2neo import Graph, Node, Relationship

# Neo4j connection settings are read from the environment so that no
# credentials are stored in the notebook itself.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Stop before attempting to connect when any setting is unset or empty.
if not (NEO4J_URI and NEO4J_USERNAME and NEO4J_PASSWORD):
    raise ValueError("Missing env vars: NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD")

# Shared graph handle used by every later cell.
graph = Graph(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))


In [None]:
import pandas as pd

In [None]:
from pathlib import Path
# The source CSV sits one directory above the notebook's working directory.
DATA_PATH = Path("..", "disease.csv")
df = pd.read_csv(DATA_PATH)

In [None]:
# Drop the columns named "Unnamed: 15" through "Unnamed: 19";
# errors="ignore" skips any of them that are not present in the frame.
df = df.drop(columns=[f"Unnamed: {i}" for i in (15, 16, 17, 18, 19)], errors="ignore")

In [None]:
# Strip scraping artifacts from text cells; anything that is not a string passes through untouched.
def data_uniform(data):
    """Return ``data`` with known noise removed when it is a string.

    For strings, every ``[详细]`` marker and every ``...`` run is removed and
    the result is stripped of leading/trailing whitespace. Values of any
    other type (numbers, ``None``, NaN, ...) are returned unchanged.
    """
    if not isinstance(data, str):
        return data
    return data.replace('[详细]', '').replace('...', '').strip()

In [None]:
# Collect, in first-seen order, the unique disease names and the unique
# per-disease attribute dictionaries that are later turned into Neo4j nodes.
disease_attributes = []
name_nodes = []

_seen_names = set()
_seen_dicts = set()

# Attribute name -> positional column index in the CSV.
_DISEASE_FIELDS = {
    "name": 0,
    "age": 3,
    "infection": 4,
    "insurance": 5,
    "checklist": 7,
    "treatment": 10,
    "period": 12,
    "rate": 13,
    "money": 14,
}

for index, row in df.iterrows():
    # .iloc makes the positional lookup explicit; a bare row[0] on a
    # label-indexed Series only works through a fallback that newer pandas
    # versions deprecate.
    names_dic = {
        field: data_uniform(row.iloc[column])
        for field, column in _DISEASE_FIELDS.items()
    }

    # Dicts are unhashable, so a sorted tuple of items serves as the dedup key.
    # Keys are unique, so sorting never has to compare the (mixed-type) values.
    key = tuple(sorted(names_dic.items()))
    if key not in _seen_dicts:
        disease_attributes.append(names_dic)
        _seen_dicts.add(key)

    name = names_dic["name"]
    if name not in _seen_names:
        name_nodes.append(name)
        _seen_names.add(name)


In [None]:
import re

# One (list, set) pair per entity type of the knowledge graph: the *_nodes list
# holds the unique names in insertion order, the matching *_seen set is the
# membership index passed alongside it to data_addtolist.
alias_nodes, alias_seen = [], set()
part_nodes, part_seen = [], set()
department_nodes, department_seen = [], set()
symptom_nodes, symptom_seen = [], set()
complication_nodes, complication_seen = [], set()
drug_nodes, drug_seen = [], set()

# Clean a cell, split it into individual entity names, and record the ones not seen before.
def data_addtolist(data, lists, seen):
    """Add each new entity name found in ``data`` to ``lists`` and ``seen``.

    Args:
        data: Raw cell value. It is cleaned with ``data_uniform``; if the
            cleaned value is not a string, nothing is added.
        lists: List that receives new names, in the order encountered.
        seen: Set of names already recorded; consulted for the duplicate
            check and updated together with ``lists``.
    """
    text = data_uniform(data)
    if not isinstance(text, str):
        return
    # A cell may hold several entities separated by a full-width comma,
    # an ASCII comma, a space, or an enumeration comma.
    for piece in re.split('，|,| |、', text):
        entity = piece.strip()
        if not entity or entity in seen:
            continue
        seen.add(entity)
        lists.append(entity)

# Walk every row and collect the entity names found in each multi-valued column.
# Positional columns: 1 alias, 2 body part, 6 department, 8 symptom,
# 9 complication, 11 drug.
for index, row in df.iterrows():
    # .iloc makes the positional lookup explicit; a bare row[1] on a
    # label-indexed Series only works through a fallback that newer pandas
    # versions deprecate.
    data_addtolist(row.iloc[1], alias_nodes, alias_seen)
    data_addtolist(row.iloc[2], part_nodes, part_seen)
    data_addtolist(row.iloc[6], department_nodes, department_seen)
    data_addtolist(row.iloc[8], symptom_nodes, symptom_seen)
    data_addtolist(row.iloc[9], complication_nodes, complication_seen)
    data_addtolist(row.iloc[11], drug_nodes, drug_seen)


In [None]:
from py2neo import NodeMatcher

In [None]:
# Fetch a node by label and attrs['name']
def match_node(graph, label, attrs):
    """Return the first node labelled ``label`` whose ``name`` equals ``attrs["name"]``.

    Args:
        graph: Graph handle passed to ``NodeMatcher``.
        label: Node label to match.
        attrs: Mapping that must contain a ``"name"`` key.

    Returns:
        Whatever ``NodeMatch.first()`` yields for the match.

    Raises:
        KeyError: If ``attrs`` has no ``"name"`` entry.
    """
    # Match on the property value itself instead of concatenating the name into
    # a where-clause string, so names containing quotes or backslashes cannot
    # break (or alter) the generated query.
    return NodeMatcher(graph).match(label, name=attrs["name"]).first()

In [None]:
# Fetch a node by label and name
def match_node2(graph, label, name):
    """Return the first node labelled ``label`` whose ``name`` equals ``str(name)``.

    Args:
        graph: Graph handle passed to ``NodeMatcher``.
        label: Node label to match.
        name: Node name; converted with ``str`` before matching.

    Returns:
        Whatever ``NodeMatch.first()`` yields for the match.
    """
    # Match on the property value itself instead of concatenating the name into
    # a where-clause string, so names containing quotes or backslashes cannot
    # break (or alter) the generated query.
    return NodeMatcher(graph).match(label, name=str(name)).first()

In [None]:
# Edge lists, one per relationship type. Each element is
# [disease_name, related_entity_name].
name_alias = []
name_part = []
name_department = []
name_symptom = []
name_complication = []
name_drug = []


# Helper function to extract relationship pairs from a table cell.
def rel_tolist(edge_list, node1, node2, seen=None):
    """Append a ``[source, target]`` pair for every entity named in a cell.

    Args:
        edge_list: List that receives the new ``[source, target]`` pairs.
        node1: Raw source value; cleaned with ``data_uniform``.
        node2: Raw cell with zero or more target names separated by ``，``,
            ``,``, a space, or ``、``. If the cleaned value is not a string,
            nothing is added.
        seen: Optional set of ``(source, target)`` tuples mirroring
            ``edge_list``. When supplied, it is used for the duplicate check
            and kept in sync, so the check does not scan the whole list.
            When omitted, an equivalent set is rebuilt from ``edge_list`` on
            this call.
    """
    # Normalize both ends.
    src = data_uniform(node1)
    dst_raw = data_uniform(node2)
    if not isinstance(dst_raw, str):
        return

    if seen is None:
        seen = {tuple(pair) for pair in edge_list}

    for dst in re.split('，|,| |、', dst_raw):
        dst = dst.strip()
        if not dst:
            continue
        pair_key = (src, dst)
        if pair_key not in seen:
            seen.add(pair_key)
            edge_list.append([src, dst])


# (edge list, positional column holding the related entities, dedup set)
# for each relationship type.
_relation_specs = [
    (name_alias, 1, set()),
    (name_part, 2, set()),
    (name_department, 6, set()),
    (name_symptom, 8, set()),
    (name_complication, 9, set()),
    (name_drug, 11, set()),
]

for index, row in df.iterrows():
    # .iloc makes the positional lookup explicit; a bare row[0] on a
    # label-indexed Series only works through a fallback that newer pandas
    # versions deprecate.
    disease = row.iloc[0]
    for edge_list, column, seen_pairs in _relation_specs:
        rel_tolist(edge_list, disease, row.iloc[column], seen_pairs)


In [None]:
# Merge one "Disease" node per attribute dictionary, keyed on the "name" property.
for attrs in disease_attributes:
    disease_node = Node("Disease", **attrs)
    graph.merge(disease_node, "Disease", "name")

In [None]:
# Merge both endpoint nodes (keyed on label + name) and then the relationship
# between them, so the call can be repeated for the same pair.
def create_relationship(graph, label1, name1, label2, name2, r_name):
    """Merge two named nodes and a relationship from the first to the second.

    Args:
        graph: Graph handle on which ``merge`` is called.
        label1: Label of the start node.
        name1: Name of the start node; converted with ``str``.
        label2: Label of the end node.
        name2: Name of the end node; converted with ``str``.
        r_name: Relationship type.

    Returns:
        Always ``True``.
    """
    source = Node(label1, name=str(name1))
    target = Node(label2, name=str(name2))

    # NOTE(review): each node carries only ``name``. Confirm that the installed
    # py2neo version merges properties additively, so that merging onto an
    # existing "Disease" node does not discard its other attributes.
    for node, label in ((source, label1), (target, label2)):
        graph.merge(node, label, "name")

    # Merged rather than created, to avoid duplicate relationships.
    graph.merge(Relationship(source, r_name, target))
    return True

In [None]:
# Link each disease to its aliases.
for disease_name, alias_name in name_alias:
    create_relationship(graph, "Disease", disease_name, "alias", alias_name, "病症别名")

In [None]:
# Link each disease to the body parts it affects.
for disease_name, part_name in name_part:
    create_relationship(graph, "Disease", disease_name, "part", part_name, "病痛的部位")

In [None]:
# Link each disease to the departments it belongs to.
for disease_name, department_name in name_department:
    create_relationship(graph, "Disease", disease_name, "department", department_name, "疾病所属部门")

In [None]:
# Link each disease to its symptoms.
for disease_name, symptom_name in name_symptom:
    create_relationship(graph, 'Disease', disease_name, 'symptom', symptom_name, '疾病症状')

In [None]:
# Link each disease to its complications; the complication end is itself a "Disease" node.
for disease_name, complication_name in name_complication:
    create_relationship(graph, 'Disease', disease_name, 'Disease', complication_name, '疾病并发症')

In [None]:
# Link each disease to the drugs used for it.
for disease_name, drug_name in name_drug:
    create_relationship(graph, 'Disease', disease_name, 'drug', drug_name, '疾病所需药物')