In [None]:
from dotenv import load_dotenv
import os

load_dotenv('.env', override=True)
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

In [None]:
import neo4j
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

ex_llm=OpenAILLM(
    model_name="gpt-4.1",
    model_params={
        "response_format": {"type": "json_object"}, # use json_object formatting for best results
        "temperature": 0 # turning temperature down for more deterministic results
    }
)

embedder = OpenAIEmbeddings(model="text-embedding-3-large")

# 定義實體及關聯

In [5]:
# define relationship types

rel_types = [
    "FUNDED_BY",            # 計畫由某組織資助
    "EXECUTED_BY",          # 計畫由某組織執行
    "DEVELOPS",             # 組織/計畫開發或優化系統/工具/資料來源
    "OPTIMIZES",            # 計畫優化系統/工具
    "HOLDS",                # 計畫舉辦事件
    "PRODUCES",             # 計畫產出資料來源 (報告/文章)
    "MAINTAINS",            # 組織維護資料來源/系統
    "ISSUES",               # 組織發布法規
    "EMPLOYS",              # 組織雇用人員 (注意: 原文姓名遮蔽)
    "AFFILIATED_WITH",      # 人員隸屬於組織
    "PARTICIPATES_IN",      # 人員/組織參與計畫
    "ATTENDS",              # 人員參與事件
    "ADVISES_ON",           # 人員對系統/計畫提供建議
    "USES_DATA_FROM",       # 系統使用資料來源
    "INTEGRATES",           # 系統整合資料來源
    "REFERENCES_TOOL",      # 系統參考其他工具/邏輯
    "BASED_ON_LOGIC_OF",    # 系統基於某工具的邏輯
    "APPLIES_METHOD",       # 系統應用某概念/方法
    "INCLUDES_MODULE",      # 系統包含某概念/模組
    "GENERATES",            # 系統產生資料 (報表/評估)
    "USES_TECHNOLOGY",      # 系統使用某技術/工具
    "LISTS",                # 資料來源列出化學物質
    "APPLIES_TO",           # 資料來源/法規適用於地點/概念(產業)
    "IDENTIFIED_BY",        # 化學物質由某概念標識 (CAS No./SMILES)
    "HAS_HAZARD",           # 化學物質具有某危害概念
    "CAN_BE_REPLACED_BY",   # 化學物質可被另一化學物質替代
    "RELATED_TO",           # 概念與概念相關
    "FOCUSES_ON",           # 事件聚焦於某系統/工具
    "PART_OF",              # 組織屬於某地點
    "HAS_KEYWORD",          # 計畫有關鍵字
]

basic_node_labels = [
    "Project",              # 代表具體的計畫，例如本計畫 "112DA023"
    "Organization",         # 代表組織或單位，如政府機關、大學、公司、國際組織等
    "Person",               # 代表參與計畫或被提及的個人 (不論是具名或角色)
    "SystemTool",           # 代表系統、軟體工具、模型、計算方法等，如 SAS, QSAR Toolbox, TEST
    "DataSource",           # 代表資料的來源，如資料庫、清單、報告、文章、網站等
    "ChemicalSubstance",    # 代表化學物質，可以是具體物質 (如 PFAS, 苯) 或物質類別
    "Concept",              # 代表抽象概念、方法學、屬性、危害類別等，如綠色化學, QSAR, 危害終點, CAS No.
    "Event",                # 代表發生的事件，如會議、教育訓練、工作坊等
    "Location",             # 代表地理位置或區域，如臺灣, 美國, 歐盟
    "Regulation",           # 代表法規、標準或指南，如 REACH, 加州 65 號提案
    "Keyword"
]

POTENTIAL_SCHEMA = [
    ("Project", "FUNDED_BY", "Organization"),
    ("Project", "EXECUTED_BY", "Organization"),
    ("Project", "DEVELOPS", "SystemTool"),
    ("Project", "OPTIMIZES", "SystemTool"),
    ("Project", "HOLDS", "Event"),
    ("Project", "PRODUCES", "DataSource"),
    ("Project", "HAS_KEYWORD", "Keyword"),
    ("Organization", "DEVELOPS", "SystemTool"),
    ("Organization", "MAINTAINS", "DataSource"),
    ("Organization", "ISSUES", "Regulation"),
    ("Organization", "EMPLOYS", "Person"), # Note: Usefulness limited by masked names
    ("Organization", "PART_OF", "Location"),
    ("Person", "AFFILIATED_WITH", "Organization"),
    ("Person", "PARTICIPATES_IN", "Project"),
    ("Person", "ATTENDS", "Event"),
    ("Person", "ADVISES_ON", "SystemTool"),
    ("SystemTool", "USES_DATA_FROM", "DataSource"),
    ("SystemTool", "INTEGRATES", "DataSource"),
    ("SystemTool", "REFERENCES_TOOL", "SystemTool"),
    ("SystemTool", "APPLIES_METHOD", "Concept"),
    ("SystemTool", "INCLUDES_MODULE", "Concept"),
    ("SystemTool", "USES_TECHNOLOGY", "SystemTool"), # For software dependencies
    ("SystemTool", "GENERATES", "DataSource"), # For reports/visualizations
    ("DataSource", "LISTS", "ChemicalSubstance"),
    ("DataSource", "APPLIES_TO", "Location"),
    ("DataSource", "APPLIES_TO", "Concept"), # e.g., Industry concept
    ("ChemicalSubstance", "IDENTIFIED_BY", "Concept"), # e.g., CAS No.
    ("ChemicalSubstance", "HAS_HAZARD", "Concept"), # e.g., Hazard Endpoint
    ("ChemicalSubstance", "CAN_BE_REPLACED_BY", "ChemicalSubstance"),
    ("Concept", "RELATED_TO", "Concept"),
    ("Event", "FOCUSES_ON", "SystemTool"),
    ("Regulation", "APPLIES_TO", "Location"),
]

# 中文prompt效果不佳
prompt_template = '''
You are a chemical researcher tasks with extracting information from papers 
and structuring it in a property graph to inform further research Q&A.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. the relationship direction goes from the start node to the end node. 


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text. Do not add any additional information.  
- If the input text is empty, return empty Json. 
- Make sure to create as many nodes and relationships as needed to offer rich context for further research.
- An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions. 
- Multiple documents will be ingested from different sources and we are using this property graph to connect information, so make sure entity types are fairly general. 

Use only fhe following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

In [None]:
from langchain_text_splitters import CharacterTextSplitter
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder_pdf = SimpleKGPipeline(
    llm=ex_llm,
    driver=driver,
    text_splitter=LangChainTextSplitterAdapter(
        CharacterTextSplitter(chunk_size=300, chunk_overlap=45, length_function=len, separator="")
    ),
    embedder=embedder, 
    entities=basic_node_labels,
    relations=rel_types,
    prompt_template=prompt_template, # 可選
    potential_schema=POTENTIAL_SCHEMA,
    from_pdf=True
)

pdf_file_paths = ['../test.pdf']

for path in pdf_file_paths:
    print(f"Processing : {path}")
    pdf_result = await kg_builder_pdf.run_async(file_path=path)
    print(f"Result: {pdf_result}")

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 54 0 (offset 0)
Ignoring wrong pointing object 66 0 (offset 0)
Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 76 0 (offset 0)
Ignoring wrong pointing object 81 0 (offset 0)
Ignoring wrong 

Processing : ./test.pdf
Result: run_id='4424df03-3a93-4c77-8f5a-7b5924c2f5c1' result={'resolver': {'number_of_nodes_to_resolve': 1629, 'number_of_created_nodes': 1165}}


# neo4j操作語法

In [9]:
from random import sample

random_topic_list = sample(basic_node_labels,3)
topic_1 = random_topic_list[0]
topic_2 = random_topic_list[1]
topic_3 = random_topic_list[2]

query = f"""
MATCH (p:{topic_1}) WITH p ORDER BY rand() LIMIT 1
MATCH (c:{topic_2}) WITH p, c ORDER BY rand() LIMIT 1
MATCH (proj:{topic_3}) WITH p, c, proj ORDER BY rand() LIMIT 1
OPTIONAL MATCH (selected_node)-[r]-(external_node)
WHERE selected_node IN [p, c, proj] AND NOT external_node IN [p, c, proj]
RETURN p, c, proj,
       collect(DISTINCT {{
           source: selected_node,
           relationship: r,
           external_neighbor: external_node,
           external_neighbor_properties: properties(external_node)
       }}) AS external_connections_with_properties
"""

with driver.session() as session:
    result = session.read_transaction(lambda tx: tx.run(query).single()) # .single() 取得唯一的 Record

if result:
    # 1. 取得頂層節點物件 (與之前相同)
    p_node = result["p"]
    c_node = result["c"]
    proj_node = result["proj"]

    print(f"Selected {topic_1}: {p_node.get('name', 'N/A')} (ID: {p_node.id})")
    print(f"Selected {topic_2}: {c_node.get('name', 'N/A')} (ID: {c_node.id})")
    print(f"Selected {topic_3}: {proj_node.get('name', 'N/A')} (ID: {proj_node.id})")
    print("-" * 20)


    # 2. 取得外部連接的列表
    connections_list = result["external_connections_with_properties"]
    print(f"Found {len(connections_list)} external connections.")
    print("-" * 20)

    # --- 遍歷列表，從每個 Map 中取值 ---
    final_result = []

    for i, connection_info in enumerate(connections_list):
        # a) 取得來源節點物件 (Node)
        source_node = connection_info["source"]
        source_id = source_node.id
        labels_dict = dict(connection_info.items())['external_neighbor'].labels
        labels_to_exclude = {'__Entity__', '__KGBuilder__'} 
        source_labels = set(labels_dict)-labels_to_exclude # 移除default labels

        # b) 取得關係物件 (Relationship)
        relationship = connection_info["relationship"]
        relationship_type = relationship.type
        relationship_props = dict(relationship.items()) # 關係的屬性

        # c) 取得外部鄰居節點物件 (Node)
        neighbor_node = connection_info["external_neighbor"]
        neighbor_id = neighbor_node.id

        # d) 取得外部鄰居的屬性 Map
        neighbor_properties = connection_info["external_neighbor_properties"]

        relevant_neighbor_prop_name = None
        relevant_neighbor_prop_value = None

        # 檢查 relationship_type
        if relationship_type == "FROM_CHUNK" or relationship_type == "NEXT_CHUNK":
            relevant_neighbor_prop_name = "text"
            # 從 neighbor_properties 字典中安全地取得 'text' 屬性
            relevant_neighbor_prop_value = neighbor_properties.get(relevant_neighbor_prop_name, "N/A")

        else:
            # 對於所有其他 relationship_type
            relevant_neighbor_prop_name = "name"
            # 從 neighbor_properties 字典中安全地取得 'name' 屬性
            relevant_neighbor_prop_value = neighbor_properties.get(relevant_neighbor_prop_name, "N/A")

        # print(f"  Connection {i+1}:")
        # print(f"    Source: Node(id={source_id}, labels={source_labels}, name='{source_node.get('name',source_node.get('text', '...'))}')") # 簡單顯示來源節點名字或文字
        # print(f"    Relationship: Type='{relationship_type}', Props={relationship_props}")
        # print(f"    Neighbor: Node(id={neighbor_id})")
        # 不再印出完整的 neighbor_properties
        # print(f"    Neighbor Properties: {neighbor_properties}")

        # 印出根據條件篩選後的屬性
        # print(f"    Relevant Neighbor Property ('{relevant_neighbor_prop_name}'): {relevant_neighbor_prop_value}")
        # print("-" * 10)

        # print(f"  Connection {i+1}:")
        # print(f"    Source: Node(labels={source_labels}, name='{source_node.get('name',source_node.get('text', '...'))}')") # 簡單顯示來源節點名字或文字
        # print(f"    Relationship: Type='{relationship_type}', Props={relationship_props}")
        # # 印出根據條件篩選後的屬性
        # print(f"    Relevant Neighbor Property ('{relevant_neighbor_prop_name}'): {relevant_neighbor_prop_value}")
        # print("-" * 10)

        node_info = {
            "source_labels" : source_labels,
            "source_name" : source_node.get('name',source_node.get('text')),
            "relationship_type":relationship_type,
            "relationship_detail":relationship_props,
            "relevant_neighbor_prop_name":relevant_neighbor_prop_name,
            "relevant_neighbor_prop_value":relevant_neighbor_prop_value
        }
        final_result.append(node_info)
else:
    print("Query returned no results or mock data not provided.")

  result = session.read_transaction(lambda tx: tx.run(query).single()) # .single() 取得唯一的 Record


Selected SystemTool: My Chemical Monitoring (ID: 772)
Selected Concept: 交叉參照 (Read-Across, RAx) (ID: 1406)
Selected Keyword: Chemical Detection (ID: 852)
--------------------
Found 8 external connections.
--------------------


  print(f"Selected {topic_1}: {p_node.get('name', 'N/A')} (ID: {p_node.id})")
  print(f"Selected {topic_2}: {c_node.get('name', 'N/A')} (ID: {c_node.id})")
  print(f"Selected {topic_3}: {proj_node.get('name', 'N/A')} (ID: {proj_node.id})")
  source_id = source_node.id
  neighbor_id = neighbor_node.id


In [10]:
final_result

[{'source_labels': {'Chunk'},
  'source_name': 'My Chemical Monitoring',
  'relationship_type': 'FROM_CHUNK',
  'relationship_detail': {},
  'relevant_neighbor_prop_name': 'text',
  'relevant_neighbor_prop_value': 'https://chemycal.com/)   Chemycal是比利時布魯塞爾的化學資訊企業，主要業務在於追蹤來自世界各地對於化學品相關規範，並且隨時提供最新的化學品使用規範的整合資訊給用戶，幫助用戶企業與組織可以取得最新資訊，旨在幫助製造業維持供應鏈穩定及提升競爭力。My Chemical Monitoring是需付費的訂閱制網頁服務，提供化學品相關的\n計畫工作內容  \n15  \n最新規範及異動，讓使用者更易於篩選危害性化學品，並尋找替代品。 4. Pharos (https://pharosproject.net/)    健康建築網路（Healthy Building Network'},
 {'source_labels': {'Chunk'},
  'source_name': 'Chemical Detection',
  'relationship_type': 'FROM_CHUNK',
  'relationship_detail': {},
  'relevant_neighbor_prop_name': 'text',
  'relevant_neighbor_prop_value': 'ease Database 皮膚與呼吸敏感 全球 所有 非強制 613 \n15 MA TURA Chemical list 美國麻州政府 US Massachusetts Toxics Use Reduction Act  美國麻州 所有 非強制 1725 \n16 EDF - Chemical Detection Project: Full List of Chemicals Detected  Environmental Defense Fund (EDF) - Chemical Detection Project: Fu