In [1]:
import os
from lightrag import LightRAG, QueryParam
from lightrag.llm import openai_complete_if_cache
from lightrag.llm import ollama_embedding
from lightrag.utils import EmbeddingFunc
from dotenv import load_dotenv
import nest_asyncio 
# Patch asyncio via nest_asyncio before any async call is made.
# NOTE(review): presumably required because Jupyter already runs an event
# loop and later cells drive LightRAG's async API — confirm.
nest_asyncio.apply() 

# Load variables from a .env file; DASHSCOPE_API_KEY is read from the
# environment later via os.getenv.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Directory handed to LightRAG as `working_dir`.
WORKING_DIR = "../data/test_neo4j"

# makedirs(..., exist_ok=True) also creates a missing parent ("../data") and
# does nothing when the directory already exists, so the cell can be re-run
# safely and has no exists()/mkdir() race.
os.makedirs(WORKING_DIR, exist_ok=True)

In [3]:
async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Request a completion from the "qwen-plus" model through DashScope's
    OpenAI-compatible endpoint.

    Args:
        prompt: Prompt text, forwarded unchanged.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Optional list of earlier messages. When omitted (or
            None) a new empty list is created for this call, so no list object
            is shared between calls.
        **kwargs: Extra keyword arguments forwarded verbatim to
            ``openai_complete_if_cache``.

    Returns:
        Whatever ``openai_complete_if_cache`` returns (annotated as ``str``).
    """
    # A literal `[]` default would be created once and reused by every call;
    # build a fresh list per call instead.
    if history_messages is None:
        history_messages = []

    return await openai_complete_if_cache(
        "qwen-plus",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        # The API key comes from the environment (populated by load_dotenv()).
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        **kwargs,
    )

### LightRAG

In [4]:
# Build the LightRAG instance used by the rest of the notebook.
rag = LightRAG(
    working_dir=WORKING_DIR,
    # NOTE(review): the workspace is named after Postgres while WORKING_DIR is
    # named after Neo4j — confirm the naming is intentional.
    workspace="test_postgres_storage",
    llm_model_func=llm_model_func,
    embedding_func=EmbeddingFunc(
        # assumes the embedding model emits 1024-dimensional vectors — TODO confirm
        embedding_dim=1024,
        max_token_size=512,
        # Embeddings are requested from an Ollama server at a hard-coded
        # address; NOTE(review): consider moving the host into configuration.
        func=lambda texts: ollama_embedding(
            texts, embed_model="viosay/conan-embedding-v1:latest", host="http://192.168.69.234:11343"
        ),
    ),
    chunk_token_size=500,
    log_level="DEBUG",
    # Storage backends are selected by name: vectors via
    # PostgresVectorDBStorage, the graph via Neo4JStorage.
    vector_storage="PostgresVectorDBStorage",
    graph_storage="Neo4JStorage"
)

# rag.vector_db_storage_cls.db = postgresql_db

INFO:lightrag:Logger initialized for working directory: ../data/test_neo4j
DEBUG:lightrag:LightRAG init with param:
  working_dir = ../data/test_neo4j,
  workspace = test_postgres_storage,
  kv_storage = JsonKVStorage,
  vector_storage = PostgresVectorDBStorage,
  graph_storage = Neo4JStorage,
  log_level = DEBUG,
  chunk_token_size = 500,
  chunk_overlap_token_size = 100,
  tiktoken_model_name = gpt-4o-mini,
  entity_extract_max_gleaning = 1,
  entity_summary_to_max_tokens = 500,
  node_embedding_algorithm = node2vec,
  node2vec_params = {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3},
  embedding_func = {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f3dfe4022a0>},
  embedding_batch_num = 32,
  embedding_func_max_async = 16,
  llm_model_func = <function llm_model_func at 0x7f3dfe402020>,
  llm_model_name = meta-llama/Llama-3.2-1B-Instruct,
  llm_model_max_token_size = 32768,
  llm_model_max_

INFO:lightrag:Tables created successfully
INFO:lightrag:Tables created successfully
INFO:lightrag:Tables created successfully


In [5]:
from langchain_community.document_loaders.csv_loader import CSVLoader

doc_path = "../data/paper/scholat_paper_ed/scholat_paper_ed_001.csv"

# Load the CSV and keep only the text content of each loaded document.
loader = CSVLoader(doc_path)
data = [document.page_content for document in loader.load()]

# Records at positions 4..9 (six items) are the ones inserted later.
need_to_insert_data = data[4:10]
need_to_insert_data

['title: 基于WWW的交互式网络课件系统的开发技术\nauthors: 傅秀芬，汤庸\nsource: 计算机工程与应用\nsourceDetail: \ndate: 1998.-\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 时态变量“Now”语义及相应时态关系运算\nauthors: 叶小平，汤庸\nsource: 软件学报，2005，16（5）：838-845\nsourceDetail: \ndate: 2005.05\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 计算机支持的协同工作概观\nauthors: 汤庸\nsource: 工业工程,1999,2(003):10-12\nsourceDetail: \ndate: 1999.01\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 时态知识和时态数据的统一模型研究\nauthors: 汤庸，汤娜，叶小平，冯智圣，肖炜\nsource: 软件学报，2003，14(S),74-79 【EI】\nsourceDetail: \ndate: 2003.11\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 基于描述逻辑的CIM模型\nauthors: 蒋运承 汤庸 王驹 周生明\nsource: 微电子学与计算机,2007,24(012):55-58\nsourceDetail: \ndate: 2007.-\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: CD—ROM信息存储与检索技术\nauthors: 汤庸\nsource: 计算机时代     1996年 第01期\nsourceDetail: \ndate: 1996.01\ntype: 期刊论文\nkeyword: \nsummary: ']

In [6]:
# Print the entity, relationship and chunk `*_vdb` storage objects, one per line.
print(rag.entities_vdb, rag.relationships_vdb, rag.chunks_vdb, sep="\n")

EntityStorage(namespace='entities', global_config={'working_dir': '../data/test_neo4j', 'workspace': 'test_postgres_storage', 'kv_storage': 'JsonKVStorage', 'vector_storage': 'PostgresVectorDBStorage', 'graph_storage': 'Neo4JStorage', 'log_level': 'DEBUG', 'chunk_token_size': 500, 'chunk_overlap_token_size': 100, 'tiktoken_model_name': 'gpt-4o-mini', 'entity_extract_max_gleaning': 1, 'entity_summary_to_max_tokens': 500, 'node_embedding_algorithm': 'node2vec', 'node2vec_params': {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3}, 'embedding_func': {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f3dfe4022a0>}, 'embedding_batch_num': 32, 'embedding_func_max_async': 16, 'llm_model_func': <function llm_model_func at 0x7f3dfe402020>, 'llm_model_name': 'meta-llama/Llama-3.2-1B-Instruct', 'llm_model_max_token_size': 32768, 'llm_model_max_async': 16, 'llm_model_kwargs': {}, 'vector_db_storage_cls_kwargs':

In [7]:
# Inspect the object LightRAG holds as its vector storage class/factory.
print(rag.vector_storage_cls)

<function PostgresStorageFactory.get_storage_class at 0x7f3dff063e20>


In [8]:
# Print the engine URL behind the entity store to check which database is targeted.
print(rag.entities_vdb.storage.engine.url)

postgresql+asyncpg://postgres:***@localhost:6024/test_postgres


In [9]:
# await rag.entities_vdb.storage.init_tables()

In [12]:
# Insert the selected records into the index.
# NOTE(review): judging by the log output this chunks the documents and calls
# the remote LLM endpoint per chunk, so the cell is slow and billable — confirm.
await rag.ainsert(need_to_insert_data)

INFO:lightrag:[New Docs] inserting 6 docs
INFO:lightrag:[New Chunks] inserting 6 chunks
DEBUG:lightrag:[upserting chunks] {'chunk-c3ddc19663e979c22961236d629d7faa': {'tokens': 60, 'content': 'title: 基于WWW的交互式网络课件系统的开发技术\nauthors: 傅秀芬，汤庸\nsource: 计算机工程与应用\nsourceDetail: \ndate: 1998.-\ntype: 期刊论文\nkeyword: \nsummary:', 'chunk_order_index': 0, 'full_doc_id': 'doc-c3ddc19663e979c22961236d629d7faa'}, 'chunk-49dcb89e79b16caced258e08371122e5': {'tokens': 71, 'content': 'title: 时态变量“Now”语义及相应时态关系运算\nauthors: 叶小平，汤庸\nsource: 软件学报，2005，16（5）：838-845\nsourceDetail: \ndate: 2005.05\ntype: 期刊论文\nkeyword: \nsummary:', 'chunk_order_index': 0, 'full_doc_id': 'doc-49dcb89e79b16caced258e08371122e5'}, 'chunk-52c7b60eb2f46bca990151794d9339c5': {'tokens': 61, 'content': 'title: 计算机支持的协同工作概观\nauthors: 汤庸\nsource: 工业工程,1999,2(003):10-12\nsourceDetail: \ndate: 1999.01\ntype: 期刊论文\nkeyword: \nsummary:', 'chunk_order_index': 0, 'full_doc_id': 'doc-52c7b60eb2f46bca990151794d9339c5'}, 'chunk-73f68d1cc6f8d7d1f23e

⠙ Processed 1 chunks, 8 entities(duplicated), 7 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠹ Processed 2 chunks, 16 entities(duplicated), 13 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠸ Processed 3 chunks, 23 entities(duplicated), 19 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠼ Processed 4 chunks, 31 entities(duplicated), 26 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠴ Processed 5 chunks, 41 entities(duplicated), 35 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠦ Processed 6 chunks, 53 entities(duplicated), 46 relations(duplicated)

DEBUG:lightrag:Node WEB技术 is new to graph.
DEBUG:lightrag:Original description tokens: 24
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for WEB技术: Web技术是指用于构建和运行Web应用程序的技术集合，包括HTML、CSS、JavaScript等。
DEBUG:lightrag:Node 时态变量'NOW'语义及相应时态关系运算 is new to graph.
DEBUG:lightrag:Original description tokens: 26
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 时态变量'NOW'语义及相应时态关系运算: 这是一篇探讨时态变量'Now'的语义及其时态关系运算的学术论文。
DEBUG:lightrag:get_node: query: MATCH (n {displayName: $entity_name}) RETURN n, result: {'entity_type': 'PERSON', 'displayName': '傅秀芬', 'description': '傅秀芬是《基于WWW的交互式网络课件系统的开发技术》的作者之一。', 'source_id': 'chunk-c3ddc19663e979c22961236d629d7faa'}
DEBUG:lightrag:Node 傅秀芬 already exists in graph.
DEBUG:lightrag:Original description tokens: 23
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 傅秀芬: 傅秀芬是《基于WWW的交互式网络课件系统的开发技术》的作者之一。
DEBUG:l




DEBUG:lightrag:get_node: query: MATCH (n {displayName: $entity_name}) RETURN n, result: {'entity_type': 'JOURNAL', 'displayName': '软件学报', 'description': '《软件学报》是一份学术期刊，发表了关于描述逻辑εL的研究论文。<SEP>软件学报是一份学术期刊，发表了叶小平和汤庸的论文。<SEP>软件学报是一份学术期刊，发表了汤庸等人的论文。', 'source_id': 'chunk-49f2b8482496f51a4af2e1fb0d092e5b<SEP>chunk-49dcb89e79b16caced258e08371122e5<SEP>chunk-73f68d1cc6f8d7d1f23ed5eea890ca32'}
DEBUG:lightrag:Node 软件学报 already exists in graph.
DEBUG:lightrag:Original description tokens: 124
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 软件学报: 《软件学报》是一份学术期刊，发表了关于描述逻辑εL的研究论文。<SEP>软件学报是一份学术期刊，发表了叶小平和汤庸的论文。<SEP>软件学报是一份学术期刊，发表了汤庸等人的论文。<SEP>软件学报是一份学术期刊，发表了关于时态变量'Now'的研究论文。<SEP>软件学报是发表论文《时态知识和时态数据的统一模型研究》的学术期刊。
DEBUG:lightrag:get_node: query: MATCH (n {displayName: $entity_name}) RETURN n, result: {'entity_type': 'DATE', 'displayName': '2005.05', 'description': '这是论文《时态变量‘Now’语义及相应时态关系运算》的发表日期。', 'source_id': 'chunk-49dcb89e79b16caced258e08371122e5

KG successfully indexed.


### 可视化本地图为网页

In [None]:
import networkx as nx
from pyvis.network import Network
import random

# Read the knowledge graph from its GraphML export.
# NOTE(review): this path points at "../data/test_paper", not at WORKING_DIR
# ("../data/test_neo4j") — confirm which directory is intended.
G = nx.read_graphml("../data/test_paper/graph_chunk_entity_relation.graphml")

# Build a notebook-mode Pyvis network and populate it from the NetworkX graph.
net = Network(height="100vh", notebook=True)
net.from_nx(G)

# Assign every node a random 24-bit RGB colour (unseeded, so colours differ per run).
for node in net.nodes:
    node["color"] = f"#{random.randint(0, 0xFFFFFF):06x}"

# Write the HTML page and display it.
net.show("../data/test_paper/knowledge_graph.html")


### 可视化本地图到 neo4j

In [None]:
import os
import json
from lightrag.utils import xml_to_json
from neo4j import GraphDatabase

# Constants
# Number of node / edge records sent to Neo4j per query execution.
BATCH_SIZE_NODES = 500
BATCH_SIZE_EDGES = 100

# Neo4j connection settings.
# Values are taken from the environment (e.g. a .env file loaded with
# load_dotenv()) so real credentials never have to be committed with the
# notebook. The fallbacks keep the previous local-development values; do not
# rely on the default password outside a throwaway local instance.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "12345678")


def convert_xml_to_json(xml_path, output_path):
    """Convert the XML file at ``xml_path`` to JSON and write it to ``output_path``.

    Args:
        xml_path: Path of the XML (GraphML) file to convert.
        output_path: Path of the JSON file to write (UTF-8, indented,
            non-ASCII characters kept as-is).

    Returns:
        The converted data on success; ``None`` when the input file does not
        exist or ``xml_to_json`` yields a falsy result. Both failure cases
        print a message instead of raising.
    """
    # Guard: nothing to convert.
    if not os.path.exists(xml_path):
        print(f"Error: File not found - {xml_path}")
        return None

    # Guard: conversion produced nothing usable.
    converted = xml_to_json(xml_path)
    if not converted:
        print("Failed to create JSON data")
        return None

    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(converted, out_file, ensure_ascii=False, indent=2)
    print(f"JSON file created: {output_path}")
    return converted


def process_in_batches(tx, query, data, batch_size):
    """Run ``query`` once for each consecutive slice of ``data``.

    Args:
        tx: Object exposing ``run(query, parameters)`` (here: the transaction
            passed in by ``session.execute_write``).
        query: Cypher text. If it references the ``$nodes`` parameter each
            batch is bound to ``nodes``; otherwise it is bound to ``edges``.
        data: Sequence of records to send.
        batch_size: Maximum number of records per ``tx.run`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative size would
            otherwise silently process nothing.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Match the actual parameter token ("$nodes"), not the bare word "nodes":
    # an edge query that merely mentions nodes (a comment, `nodes(path)`, ...)
    # must still receive its data under the "edges" key.
    param_name = "nodes" if "$nodes" in query else "edges"

    for start in range(0, len(data), batch_size):
        tx.run(query, {param_name: data[start : start + batch_size]})


def main():
    """Import the GraphML graph found in WORKING_DIR into Neo4j.

    Steps:
      1. Convert ``graph_chunk_entity_relation.graphml`` to
         ``graph_data.json`` (both inside WORKING_DIR); stop if that fails.
      2. Create the nodes, then the edges, in batches inside write
         transactions.
      3. Run a final query that sets ``displayName`` and labels.

    The Cypher relies on APOC procedures (``apoc.create.*``). Any exception
    raised while talking to Neo4j is printed rather than re-raised, and the
    driver is always closed.
    """
    # Paths
    xml_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
    json_file = os.path.join(WORKING_DIR, "graph_data.json")

    # Convert XML to JSON
    json_data = convert_xml_to_json(xml_file, json_file)
    if json_data is None:
        return

    # Load nodes and edges
    nodes = json_data.get("nodes", [])
    edges = json_data.get("edges", [])

    # Neo4j queries
    # NOTE(review): the Entity label is removed right after the MERGE, so a
    # second run will likely not match the nodes created by the first one and
    # may create duplicates — confirm before re-running against the same DB.
    create_nodes_query = """
    UNWIND $nodes AS node
    MERGE (e:Entity {id: node.id})
    SET e.entity_type = node.entity_type,
        e.description = node.description,
        e.source_id = node.source_id,
        e.displayName = node.id
    REMOVE e:Entity
    WITH e, node
    CALL apoc.create.addLabels(e, [node.entity_type]) YIELD node AS labeledNode
    RETURN count(*)
    """

    # The relationship type is derived from the edge keywords: one of a few
    # known verbs if present, otherwise the first comma-separated keyword
    # with double quotes stripped.
    create_edges_query = """
    UNWIND $edges AS edge
    MATCH (source {id: edge.source})
    MATCH (target {id: edge.target})
    WITH source, target, edge,
         CASE
            WHEN edge.keywords CONTAINS 'lead' THEN 'lead'
            WHEN edge.keywords CONTAINS 'participate' THEN 'participate'
            WHEN edge.keywords CONTAINS 'uses' THEN 'uses'
            WHEN edge.keywords CONTAINS 'located' THEN 'located'
            WHEN edge.keywords CONTAINS 'occurs' THEN 'occurs'
           ELSE REPLACE(SPLIT(edge.keywords, ',')[0], '\"', '')
         END AS relType
    CALL apoc.create.relationship(source, relType, {
      weight: edge.weight,
      description: edge.description,
      keywords: edge.keywords,
      source_id: edge.source_id
    }, target) YIELD rel
    RETURN count(*)
    """

    # NOTE(review): `MATCH (n)` touches every node in the database, not only
    # the ones imported here — confirm this is acceptable for a shared DB.
    set_displayname_and_labels_query = """
    MATCH (n)
    SET n.displayName = n.id
    WITH n
    CALL apoc.create.setLabels(n, [n.entity_type]) YIELD node
    RETURN count(*)
    """

    # Create a Neo4j driver
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

    try:
        # Execute queries in batches
        with driver.session() as session:
            # Insert nodes in batches
            session.execute_write(
                process_in_batches, create_nodes_query, nodes, BATCH_SIZE_NODES
            )

            # Insert edges in batches
            session.execute_write(
                process_in_batches, create_edges_query, edges, BATCH_SIZE_EDGES
            )

            # Set displayName and labels.
            # consume() drains the auto-commit result here so that a failure
            # of this query is raised (and reported below) instead of being
            # left to whatever happens when the session is closed.
            session.run(set_displayname_and_labels_query).consume()

    except Exception as e:
        # Best-effort notebook cell: report the problem instead of raising.
        print(f"Error occurred: {e}")

    finally:
        driver.close()


# Run the import defined above when this cell is executed.
main()


## 问答

In [10]:
# Ask a question in "hybrid" mode.
# only_need_context=True — presumably returns the retrieved context rather
# than a generated answer (the saved output is entity CSV); TODO confirm.
res = await rag.aquery(
        "汤庸发表过哪些论文？",
        param=QueryParam(mode="hybrid", only_need_context=True),
    )

# Length of the returned value, then the value itself.
print(len(res))
print(res)

INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
DEBUG:lightrag:local query 从实体数据库中获取的信息 [{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x7f3dfd064ef0>, 'workspace': 'test_postgres_storage', 'createtime': datetime.datetime(2024, 11, 20, 11, 54, 16, 815444), 'entity_name': '1996年 第01期', 'embedding': array([-0.00242454,  0.02155505, -0.0056826 , ...,  0.04383247,
       -0.02738889,  0.04719466], dtype=float32), 'content': '1996年 第01期1996年 第01期是《计算机时代》发布汤庸论文的具体时间。<SEP>1996年 第01期是论文《CD—ROM信息存储与检索技术》的发表时间。', 'id': 'ent-d88c33fefb6fb7e8335e1314238f3ea0', 'updatetime': None}, {'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x7f3f79d31fd0>, 'workspace': 'test_postgres_storage', 'createtime': datetime.datetime(2024, 11, 20, 11, 54, 16, 815444), 'entity_name': '74-79', 'embedding': array([-0.00449925,  0.01295783, -0.03631313, ...,  0.1012028 ,
       -0.01351161, -0.01450292], dtype=float32), 'content': '74-7974-7

6750

-----Entities-----
```csv
id,	entity,	type,	description,	rank
1,	基于WWW的交互式网络课件系统的开发技术,PAPER,这是一篇关于基于Web的交互式网络课件系统开发技术的研究论文。<SEP>这是一篇探讨基于万维网的交互式网络课件系统开发技术的学术论文。,11
2,	软件学报,JOURNAL,《软件学报》是一份学术期刊，发表了关于描述逻辑εL的研究论文。<SEP>软件学报是一份学术期刊，发表了叶小平和汤庸的论文。<SEP>软件学报是一份学术期刊，发表了汤庸等人的论文。<SEP>软件学报是一份学术期刊，发表了关于时态变量'Now'的研究论文。<SEP>软件学报是发表论文《时态知识和时态数据的统一模型研究》的学术期刊。,6
3,	计算机支持的协同工作概观,PAPER,《计算机支持的协同工作概观》是1999年发表的一篇期刊论文，探讨了计算机支持的协同工作。<SEP>这是汤庸撰写的一篇论文，概述了计算机支持的协同工作领域。,10
4,	1996年 第01期,DATE,1996年 第01期是《计算机时代》发布汤庸论文的具体时间。<SEP>1996年 第01期是论文《CD—ROM信息存储与检索技术》的发表时间。,1
5,	74-79,TYPE,74-79是论文《时态知识和时态数据的统一模型研究》在《软件学报》14(S)期中的页码范围。,1
6,	2003.11,DATE,2003年11月是论文《时态知识和时态数据的统一模型研究》的发表日期。<SEP>2003年11月是论文发表的时间。,1
7,	计算机支持,CONCEPT,计算机支持是指利用信息技术来辅助或增强协同工作的效率和效果，是论文《计算机支持的协同工作概观》研究的重点。,1
8,	2(003):10-12,DATE,论文在期刊《工业工程》中的具体页码和期数，提供了文献引用的详细信息。<SEP>论文的具体卷期和页码，指出了论文在期刊中的位置。,2
9,	协同工作,CONCEPT,协同工作是指多人协作完成任务的过程，是论文《计算机支持的协同工作概观》讨论的核心概念。,1
10,	1999.01,DATE,1999年1月，论文《计算机支持的协同工作概观》发表。<SEP>论文发表的具体日期为1999年1月。,2
11,	16（5）：838-845,TYP

In [None]:
# Delete the entity "汤庸" from the index.
# NOTE(review): destructive — presumably affects the Postgres and Neo4j
# stores configured above; run deliberately, not as part of "Run All".
await rag.adelete_by_entity("汤庸")