In [None]:
# Standard library
import os

# Third-party
import nest_asyncio
from dotenv import load_dotenv
from lightrag import LightRAG, QueryParam
from lightrag.llm import ollama_embedding, openai_complete_if_cache
from lightrag.utils import EmbeddingFunc

# NOTE(review): presumably required so that LightRAG's async helpers can run
# inside the event loop Jupyter already owns -- confirm.
nest_asyncio.apply()

# Read variables from a .env file into the process environment; later cells
# look up DASHSCOPE_API_KEY through os.getenv. Kept as the final expression so
# the cell still displays its return value.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Directory this notebook hands to LightRAG as `working_dir`; the Neo4j export
# cell further down also reads the GraphML file from here and writes
# graph_data.json next to it.
WORKING_DIR = "../data/test_postgres"

# makedirs(exist_ok=True) creates missing parents (e.g. "../data") and is a
# no-op when the directory already exists, so the cell is safe to re-run and
# cannot race between an "exists" check and the creation.
os.makedirs(WORKING_DIR, exist_ok=True)

In [3]:
async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Complete ``prompt`` with the ``qwen-plus`` model via DashScope.

    Thin adapter around ``openai_complete_if_cache`` that pins the model name,
    the OpenAI-compatible DashScope base URL and the API key (read from the
    ``DASHSCOPE_API_KEY`` environment variable).

    Args:
        prompt: User prompt text forwarded to the model.
        system_prompt: Optional system prompt forwarded unchanged.
        history_messages: Optional list of earlier chat messages. ``None``
            (the default) is treated as an empty history.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``openai_complete_if_cache``.

    Returns:
        Whatever ``openai_complete_if_cache`` returns (annotated as ``str``).
    """
    # A literal ``[]`` default would be one list object shared by every call;
    # build a fresh list per call instead.
    if history_messages is None:
        history_messages = []

    return await openai_complete_if_cache(
        "qwen-plus",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        **kwargs,
    )

In [4]:
# Build the LightRAG instance shared by the insert and query cells below.
rag = LightRAG(
    working_dir=WORKING_DIR,
    workspace="test_postgres_storage",
    # Disabled option, left for reference:
    # kg="Neo4JStorage",
    llm_model_func=llm_model_func,
    embedding_func=EmbeddingFunc(
        # NOTE(review): presumably these must match the embedding model's
        # output dimension and input token limit -- confirm.
        embedding_dim=1024,
        max_token_size=512,
        # Embeddings come from an Ollama server at a hard-coded LAN address.
        func=lambda texts: ollama_embedding(
            texts, embed_model="viosay/conan-embedding-v1:latest", host="http://192.168.69.234:11343"
        ),
    ),
    chunk_token_size=500,
    log_level="DEBUG",
    # Only the vector store is switched to Postgres; the init log of this cell
    # shows kv_storage = JsonKVStorage and graph_storage = NetworkXStorage.
    vector_storage="PostgresVectorDBStorage"
)

# rag.vector_db_storage_cls.db = postgresql_db

INFO:lightrag:Logger initialized for working directory: ../data/test_postgres
DEBUG:lightrag:LightRAG init with param:
  working_dir = ../data/test_postgres,
  workspace = test_postgres_storage,
  kv_storage = JsonKVStorage,
  vector_storage = PostgresVectorDBStorage,
  graph_storage = NetworkXStorage,
  log_level = DEBUG,
  chunk_token_size = 500,
  chunk_overlap_token_size = 100,
  tiktoken_model_name = gpt-4o-mini,
  entity_extract_max_gleaning = 1,
  entity_summary_to_max_tokens = 500,
  node_embedding_algorithm = node2vec,
  node2vec_params = {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3},
  embedding_func = {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f6084b46d40>},
  embedding_batch_num = 32,
  embedding_func_max_async = 16,
  llm_model_func = <function llm_model_func at 0x7f6084b472e0>,
  llm_model_name = meta-llama/Llama-3.2-1B-Instruct,
  llm_model_max_token_size = 32768,
  llm_m

INFO:lightrag:Tables created successfully


In [5]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# CSV file to load; each loaded document's text is kept below.
doc_path = "../data/paper/scholat_paper_ed/scholat_paper_ed_001.csv"
loader = CSVLoader(doc_path)

# Keep only the text of each loaded document.
data = [document.page_content for document in loader.load()]

# Insert just the third record for this smoke test (slice keeps it a list).
need_to_insert_data = data[2:3]
need_to_insert_data

['title: 基于描述逻辑的带属性依赖时序ER模型\nauthors: 蒋运承\xa0汤庸\xa0王驹\xa0冀高峰\nsource: 计算机研究与发展，2007，44（10）: 1765～1773\nsourceDetail: \ndate: 2007.10\ntype: 期刊论文\nkeyword: \nsummary: ']

In [6]:
# Inspect the three vector stores LightRAG created (entities, relationships,
# chunks), one per line.
print(rag.entities_vdb, rag.relationships_vdb, rag.chunks_vdb, sep="\n")

EntityStorage(namespace='entities', global_config={'working_dir': '../data/test_postgres', 'workspace': 'test_postgres_storage', 'kv_storage': 'JsonKVStorage', 'vector_storage': 'PostgresVectorDBStorage', 'graph_storage': 'NetworkXStorage', 'log_level': 'DEBUG', 'chunk_token_size': 500, 'chunk_overlap_token_size': 100, 'tiktoken_model_name': 'gpt-4o-mini', 'entity_extract_max_gleaning': 1, 'entity_summary_to_max_tokens': 500, 'node_embedding_algorithm': 'node2vec', 'node2vec_params': {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3}, 'embedding_func': {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f6084b46d40>}, 'embedding_batch_num': 32, 'embedding_func_max_async': 16, 'llm_model_func': <function llm_model_func at 0x7f6084b472e0>, 'llm_model_name': 'meta-llama/Llama-3.2-1B-Instruct', 'llm_model_max_token_size': 32768, 'llm_model_max_async': 16, 'llm_model_kwargs': {}, 'vector_db_storage_cls_kw

In [7]:
# Sanity check: show what LightRAG resolved for the configured vector storage.
print(rag.vector_storage_cls)

<function PostgresStorageFactory.get_storage_class at 0x7f6085a64360>


In [8]:
# Sanity check: show the database URL of the engine behind the entities
# vector store, to confirm which Postgres instance is in use.
print(rag.entities_vdb.storage.engine.url)

postgresql+asyncpg://postgres:***@localhost:6024/test_postgres


In [9]:
# await rag.entities_vdb.storage.init_tables()

In [10]:
# Insert the selected record(s). Per the log below this chunks the text,
# embeds it, runs LLM entity extraction and upserts entities/relationships.
await rag.ainsert(need_to_insert_data)

INFO:lightrag:[New Docs] inserting 1 docs
INFO:lightrag:[New Chunks] inserting 1 chunks
DEBUG:lightrag:[upserting chunks] {'chunk-8d196e1727cc81a52391584220a9e78a': {'tokens': 85, 'content': 'title: 基于描述逻辑的带属性依赖时序ER模型\nauthors: 蒋运承\xa0汤庸\xa0王驹\xa0冀高峰\nsource: 计算机研究与发展，2007，44（10）: 1765～1773\nsourceDetail: \ndate: 2007.10\ntype: 期刊论文\nkeyword: \nsummary:', 'chunk_order_index': 0, 'full_doc_id': 'doc-8d196e1727cc81a52391584220a9e78a'}}
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:lightrag:[Entity Extraction]...
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠙ Processed 1 chunks, 9 entities(duplicated), 8 relations(duplicated)

DEBUG:lightrag:Node 蒋运承 is new to graph.
DEBUG:lightrag:Original description tokens: 29
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 蒋运承: 蒋运承是该论文的作者之一，专注于描述逻辑的带属性依赖时序ER模型的研究。
DEBUG:lightrag:Upserted node 蒋运承 into knowledge graph.
DEBUG:lightrag:Node 汤庸 is new to graph.
DEBUG:lightrag:Original description tokens: 27
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 汤庸: 汤庸是该论文的作者之一，参与了描述逻辑的带属性依赖时序ER模型的研究。
DEBUG:lightrag:Upserted node 汤庸 into knowledge graph.
DEBUG:lightrag:Node 王驹 is new to graph.
DEBUG:lightrag:Original description tokens: 27
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 王驹: 王驹是该论文的作者之一，贡献了描述逻辑的带属性依赖时序ER模型的研究。
DEBUG:lightrag:Upserted node 王驹 into knowledge graph.
DEBUG:lightrag:Node 冀高峰 is new to graph.
DEBUG:lightrag:Original description tokens: 28
DEBUG:lightrag:Description is short enough, no need to s




INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
DEBUG:lightrag:[upserting relationship] {'rel-53dcf6184a97e0530d2a369e8c8223ad': {'src_id': '基于描述逻辑的带属性依赖时序ER模型', 'tgt_id': '蒋运承', 'content': '作者, 研究贡献基于描述逻辑的带属性依赖时序ER模型蒋运承蒋运承是论文的主要作者之一，对研究内容有直接贡献。'}, 'rel-f4b7a64d5c50e1ffc95f

### 可视化本地图为网页

In [None]:
import os
import random

import networkx as nx
from pyvis.network import Network

# Load the GraphML file from this notebook's WORKING_DIR -- the same file the
# Neo4j export cell reads -- instead of a path belonging to another experiment.
G = nx.read_graphml(os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml"))

# Create a Pyvis network
net = Network(height="100vh", notebook=True)

# Convert NetworkX graph to Pyvis network
net.from_nx(G)

# Give every node a random RGB colour
for node in net.nodes:
    node["color"] = "#{:06x}".format(random.randint(0, 0xFFFFFF))

# Save the HTML next to the graph data and display the network
net.show(os.path.join(WORKING_DIR, "knowledge_graph.html"))


### 可视化本地图到 neo4j

In [11]:
import os
import json
from lightrag.utils import xml_to_json
from neo4j import GraphDatabase

# Constants
# Number of node / edge records sent to Neo4j per UNWIND query.
BATCH_SIZE_NODES = 500
BATCH_SIZE_EDGES = 100

# Neo4j connection credentials
# Read from the environment so real credentials do not have to live in the
# notebook; the literals are the previous hard-coded values, kept as
# local-development fallbacks.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "12345678")


def convert_xml_to_json(xml_path, output_path):
    """Convert an XML (GraphML) file to JSON, write it to disk and return it.

    Args:
        xml_path: Path of the XML file to read.
        output_path: Path of the JSON file to write (UTF-8, indented,
            non-ASCII characters kept as-is).

    Returns:
        The data produced by ``xml_to_json``, or ``None`` when the input file
        does not exist or the conversion yields a falsy result.
    """
    # Guard: nothing to convert without the source file.
    if not os.path.exists(xml_path):
        print(f"Error: File not found - {xml_path}")
        return None

    parsed = xml_to_json(xml_path)

    # Guard: conversion produced nothing usable.
    if not parsed:
        print("Failed to create JSON data")
        return None

    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(parsed, out_file, ensure_ascii=False, indent=2)
    print(f"JSON file created: {output_path}")
    return parsed


def process_in_batches(tx, query, data, batch_size):
    """Run ``query`` once per slice of ``data`` of at most ``batch_size`` items.

    Args:
        tx: Object exposing ``run(query, parameters)`` (here: the transaction
            passed in by ``session.execute_write``).
        query: Cypher text that consumes either a ``$nodes`` or an ``$edges``
            list parameter.
        data: Sequence of records to send.
        batch_size: Maximum number of records per ``tx.run`` call.
    """
    # Decide once which parameter the query binds. Look for the "$nodes"
    # placeholder rather than the bare word "nodes", so an edge query that
    # merely mentions "nodes" somewhere in its text is not misclassified.
    param_name = "nodes" if "$nodes" in query else "edges"

    for start in range(0, len(data), batch_size):
        tx.run(query, {param_name: data[start : start + batch_size]})


def main():
    """Export the GraphML knowledge graph in WORKING_DIR into Neo4j.

    Steps:
      1. Convert ``graph_chunk_entity_relation.graphml`` to
         ``graph_data.json`` (both inside ``WORKING_DIR``); stop if that fails.
      2. Upsert the nodes, then create the edges, in batches inside write
         transactions.
      3. Run a final query that sets ``displayName`` and the labels of all
         nodes.

    The queries call ``apoc.*`` procedures, so those must be available on the
    Neo4j server. Any exception raised while talking to Neo4j is printed
    rather than re-raised, and the driver is always closed.
    """
    # Paths: GraphML input and JSON output both live in WORKING_DIR.
    xml_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
    json_file = os.path.join(WORKING_DIR, "graph_data.json")

    # Convert XML to JSON; None means the file was missing or conversion failed.
    json_data = convert_xml_to_json(xml_file, json_file)
    if json_data is None:
        return

    # Load nodes and edges (missing keys are treated as empty lists).
    nodes = json_data.get("nodes", [])
    edges = json_data.get("edges", [])

    # Neo4j queries
    # Node upsert: MERGE on (:Entity {id}), copy the properties, then replace
    # the :Entity label with a label taken from node.entity_type.
    # NOTE(review): because :Entity is removed again, a second run presumably
    # cannot MERGE-match the existing nodes and would duplicate them -- confirm.
    create_nodes_query = """
    UNWIND $nodes AS node
    MERGE (e:Entity {id: node.id})
    SET e.entity_type = node.entity_type,
        e.description = node.description,
        e.source_id = node.source_id,
        e.displayName = node.id
    REMOVE e:Entity
    WITH e, node
    CALL apoc.create.addLabels(e, [node.entity_type]) YIELD node AS labeledNode
    RETURN count(*)
    """

    # Edge creation: endpoints are matched by id only (no label). The
    # relationship type is one of a few fixed English keywords when present in
    # edge.keywords, otherwise the first comma-separated keyword with double
    # quotes stripped.
    create_edges_query = """
    UNWIND $edges AS edge
    MATCH (source {id: edge.source})
    MATCH (target {id: edge.target})
    WITH source, target, edge,
         CASE
            WHEN edge.keywords CONTAINS 'lead' THEN 'lead'
            WHEN edge.keywords CONTAINS 'participate' THEN 'participate'
            WHEN edge.keywords CONTAINS 'uses' THEN 'uses'
            WHEN edge.keywords CONTAINS 'located' THEN 'located'
            WHEN edge.keywords CONTAINS 'occurs' THEN 'occurs'
           ELSE REPLACE(SPLIT(edge.keywords, ',')[0], '\"', '')
         END AS relType
    CALL apoc.create.relationship(source, relType, {
      weight: edge.weight,
      description: edge.description,
      keywords: edge.keywords,
      source_id: edge.source_id
    }, target) YIELD rel
    RETURN count(*)
    """

    # Final pass over every node in the database (MATCH (n) is unfiltered):
    # set displayName from id and set the labels to [entity_type].
    set_displayname_and_labels_query = """
    MATCH (n)
    SET n.displayName = n.id
    WITH n
    CALL apoc.create.setLabels(n, [n.entity_type]) YIELD node
    RETURN count(*)
    """

    # Create a Neo4j driver from the module-level connection constants.
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

    try:
        # Execute queries in batches
        with driver.session() as session:
            # Insert nodes in batches (must happen before edges, which MATCH
            # their endpoints by id).
            session.execute_write(
                process_in_batches, create_nodes_query, nodes, BATCH_SIZE_NODES
            )

            # Insert edges in batches
            session.execute_write(
                process_in_batches, create_edges_query, edges, BATCH_SIZE_EDGES
            )

            # Set displayName and labels
            session.run(set_displayname_and_labels_query)

    except Exception as e:
        # Best-effort export: report the failure instead of raising.
        print(f"Error occurred: {e}")

    finally:
        # Always release the driver's connections.
        driver.close()


# Run the export. Relies on WORKING_DIR defined in an earlier cell.
main()


Root element: {http://graphml.graphdrawing.org/xmlns}graphml
Root attributes: {'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd'}
Found 9 nodes and 8 edges
JSON file created: ../data/test_postgres/graph_data.json


## 问答

In [12]:
# Ask "Which papers has 汤庸 published?" in hybrid mode.
# only_need_context=True is set; the printed result below is the retrieved
# entities/relationships context (CSV sections).
# NOTE(review): presumably no final LLM answer is generated in this mode -- confirm.
print(
    await rag.aquery(
        "汤庸发表过哪些论文？",
        param=QueryParam(mode="hybrid", only_need_context=True),
    )
)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
DEBUG:lightrag:local query 从实体数据库中获取的信息 [{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x7f607f19fa70>, 'id': 'ent-e773e8b7965d835b17b91f9e3f9b3c01', 'content': '汤庸汤庸是该论文的作者之一，参与了描述逻辑的带属性依赖时序ER模型的研究。', 'updatetime': None, 'workspace': 'default', 'entity_name': '汤庸', 'createtime': datetime.datetime(2024, 11, 19, 9, 32, 31, 133517), 'embedding': array([ 0.00365444,  0.00635745, -0.01593765, ...,  0.03188277,
       -0.00487912,  0.0245876 ], dtype=float32)}, {'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x7f607f19fe30>, 'id': 'ent-fa67366bc555b0232440fea3273f1c18', 'content': '王驹王驹是该论文的作者之一，贡献了描述逻辑的带属性依赖时序ER模型的研究。', 'updatetime': None, 'workspace': 'default', 'entity_name': '王驹', 'createtime': datetime.datetime(2024, 11, 19, 9, 32, 31, 133517), 


-----Entities-----
```csv
id,	entity,	type,	description,	rank
1,	冀高峰,PERSON,冀高峰是该论文的作者之一，参与了描述逻辑的带属性依赖时序ER模型的研究。,1
2,	王驹,PERSON,王驹是该论文的作者之一，贡献了描述逻辑的带属性依赖时序ER模型的研究。,1
3,	期刊论文,TYPE,期刊论文是指在学术期刊上发表的研究成果，本例中指《基于描述逻辑的带属性依赖时序ER模型》。,1
4,	蒋运承,PERSON,蒋运承是该论文的作者之一，专注于描述逻辑的带属性依赖时序ER模型的研究。,1
5,	基于描述逻辑的带属性依赖时序ER模型,PAPER,这是一篇学术论文，探讨了描述逻辑的带属性依赖时序ER模型。,8
6,	44（10）: 1765～1773,DATE,这是论文《基于描述逻辑的带属性依赖时序ER模型》在《计算机研究与发展》期刊上发表的具体期号和页码范围。,1
7,	2007.10,DATE,2007年10月，这是论文《基于描述逻辑的带属性依赖时序ER模型》发表的时间。,1
8,	汤庸,PERSON,汤庸是该论文的作者之一，参与了描述逻辑的带属性依赖时序ER模型的研究。,1
9,	计算机研究与发展,JOURNAL,《计算机研究与发展》是一份学术期刊，发表了关于描述逻辑的带属性依赖时序ER模型的研究论文。,1
```
-----Relationships-----
```csv
id,	source,	target,	description,	keywords,	weight,	rank
1,	基于描述逻辑的带属性依赖时序ER模型,王驹,王驹是论文的合著者，对研究内容有贡献。,合著者, 研究贡献,9.0,9
2,	基于描述逻辑的带属性依赖时序ER模型,计算机研究与发展,论文《基于描述逻辑的带属性依赖时序ER模型》发表在《计算机研究与发展》期刊上。,发表, 学术传播,10.0,9
3,	基于描述逻辑的带属性依赖时序ER模型,期刊论文,论文《基于描述逻辑的带属性依赖时序ER模型》属于期刊论文类型。,论文类型, 分类,10.0,9
4,	基于描述逻辑的带属性依赖时序ER模型,蒋运承,蒋运承是论文的主要作者之一，对研究内容有直接贡献。,作者, 研究贡献,10.0,9
5,	44（10）: 1765～1773