In [1]:
import os
from lightrag import LightRAG, QueryParam
from lightrag.llm import openai_complete_if_cache
from lightrag.llm import ollama_embedding
from lightrag.utils import EmbeddingFunc
from dotenv import load_dotenv
import nest_asyncio 
# NOTE(review): presumably needed so the async LightRAG calls below can run inside
# Jupyter's already-running event loop — confirm against the nest_asyncio docs.
nest_asyncio.apply() 

# Load environment variables from a .env file; `llm_model_func` below reads
# DASHSCOPE_API_KEY from the environment.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Directory handed to LightRAG as `working_dir`; the Neo4j export cell further
# down also reads the generated GraphML file from here.
WORKING_DIR = "../data/test_json"

# makedirs(exist_ok=True) creates missing parent directories (e.g. ../data) as
# well, and does not fail if the directory already exists or appears between a
# separate existence check and the create call.
os.makedirs(WORKING_DIR, exist_ok=True)

In [3]:
async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Request a completion from the ``qwen-plus`` model via ``openai_complete_if_cache``.

    The request goes to DashScope's OpenAI-compatible endpoint and is
    authenticated with the ``DASHSCOPE_API_KEY`` environment variable.

    Args:
        prompt: Prompt text, forwarded unchanged.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Optional list of earlier messages, forwarded
            unchanged. ``None`` (the default) is replaced by a new empty list
            on every call.
        **kwargs: Extra keyword arguments, forwarded unchanged.

    Returns:
        Whatever ``openai_complete_if_cache`` returns.
    """
    # A literal `[]` default would be one list object shared by every call;
    # build a fresh list per call so nothing can leak between requests.
    if history_messages is None:
        history_messages = []

    return await openai_complete_if_cache(
        "qwen-plus",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        **kwargs,
    )

### LightRAG

In [4]:
def _embed_texts(texts):
    """Forward ``texts`` to ``ollama_embedding`` using the Conan model on the remote Ollama host."""
    return ollama_embedding(
        texts,
        embed_model="viosay/conan-embedding-v1:latest",
        host="http://192.168.69.234:11343",
    )


# Embedding settings: 1024-dimensional vectors, inputs capped at 512 tokens.
embedding_func = EmbeddingFunc(
    embedding_dim=1024,
    max_token_size=512,
    func=_embed_texts,
)

rag = LightRAG(
    working_dir=WORKING_DIR,
    workspace="test_json",
    llm_model_func=llm_model_func,
    embedding_func=embedding_func,
    chunk_token_size=500,
    log_level="DEBUG",
    # Alternative storage backends, currently disabled:
    # vector_storage="PostgresVectorDBStorage",
    # graph_storage="Neo4JStorage"
)

# rag.vector_db_storage_cls.db = postgresql_db

INFO:lightrag:Logger initialized for working directory: ../data/test_json
DEBUG:lightrag:LightRAG init with param:
  working_dir = ../data/test_json,
  workspace = test_json,
  kv_storage = JsonKVStorage,
  vector_storage = NanoVectorDBStorage,
  graph_storage = NetworkXStorage,
  log_level = DEBUG,
  chunk_token_size = 500,
  chunk_overlap_token_size = 100,
  tiktoken_model_name = gpt-4o-mini,
  entity_extract_max_gleaning = 1,
  entity_summary_to_max_tokens = 500,
  node_embedding_algorithm = node2vec,
  node2vec_params = {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3},
  embedding_func = {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f30d37be0c0>},
  embedding_batch_num = 32,
  embedding_func_max_async = 16,
  llm_model_func = <function llm_model_func at 0x7f30d37bdd00>,
  llm_model_name = meta-llama/Llama-3.2-1B-Instruct,
  llm_model_max_token_size = 32768,
  llm_model_max_async = 16,
  l

In [5]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# CSV file with the paper records used in this demo.
doc_path = "../data/paper/scholat_paper_ed/scholat_paper_ed_001.csv"
loader = CSVLoader(doc_path)

# Keep only the text of each loaded document; the output below shows each entry
# is a "column: value" block for one paper.
data = [document.page_content for document in loader.load()]

# Entries at positions 4..9 are the ones inserted into LightRAG later on.
need_to_insert_data = data[4:10]
need_to_insert_data

['title: 基于WWW的交互式网络课件系统的开发技术\nauthors: 傅秀芬，汤庸\nsource: 计算机工程与应用\nsourceDetail: \ndate: 1998.-\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 时态变量“Now”语义及相应时态关系运算\nauthors: 叶小平，汤庸\nsource: 软件学报，2005，16（5）：838-845\nsourceDetail: \ndate: 2005.05\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 计算机支持的协同工作概观\nauthors: 汤庸\nsource: 工业工程,1999,2(003):10-12\nsourceDetail: \ndate: 1999.01\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 时态知识和时态数据的统一模型研究\nauthors: 汤庸，汤娜，叶小平，冯智圣，肖炜\nsource: 软件学报，2003，14(S),74-79 【EI】\nsourceDetail: \ndate: 2003.11\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 基于描述逻辑的CIM模型\nauthors: 蒋运承 汤庸 王驹 周生明\nsource: 微电子学与计算机,2007,24(012):55-58\nsourceDetail: \ndate: 2007.-\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: CD—ROM信息存储与检索技术\nauthors: 汤庸\nsource: 计算机时代     1996年 第01期\nsourceDetail: \ndate: 1996.01\ntype: 期刊论文\nkeyword: \nsummary: ']

In [6]:
# Show the three vector-store objects attached to `rag`, one per line.
for vdb_attr in ("entities_vdb", "relationships_vdb", "chunks_vdb"):
    print(getattr(rag, vdb_attr))

NanoVectorDBStorage(namespace='entities', global_config={'working_dir': '../data/test_json', 'workspace': 'test_json', 'kv_storage': 'JsonKVStorage', 'vector_storage': 'NanoVectorDBStorage', 'graph_storage': 'NetworkXStorage', 'log_level': 'DEBUG', 'chunk_token_size': 500, 'chunk_overlap_token_size': 100, 'tiktoken_model_name': 'gpt-4o-mini', 'entity_extract_max_gleaning': 1, 'entity_summary_to_max_tokens': 500, 'node_embedding_algorithm': 'node2vec', 'node2vec_params': {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3}, 'embedding_func': {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f30d37be0c0>}, 'embedding_batch_num': 32, 'embedding_func_max_async': 16, 'llm_model_func': <function llm_model_func at 0x7f30d37bdd00>, 'llm_model_name': 'meta-llama/Llama-3.2-1B-Instruct', 'llm_model_max_token_size': 32768, 'llm_model_max_async': 16, 'llm_model_kwargs': {}, 'vector_db_storage_cls_kwargs': {}, 'en

In [7]:
# Show which vector-storage class this LightRAG instance is configured with.
print(rag.vector_storage_cls)

<class 'lightrag.storage.NanoVectorDBStorage'>


In [8]:
# await rag.entities_vdb.storage.init_tables()

In [9]:
# Insert the selected records into LightRAG; the DEBUG log below traces the
# chunks, entities and relations it produces.
await rag.ainsert(need_to_insert_data)

INFO:lightrag:[New Docs] inserting 6 docs
INFO:lightrag:[New Chunks] inserting 6 chunks
DEBUG:lightrag:[upserting chunks] {'chunk-c3ddc19663e979c22961236d629d7faa': {'tokens': 60, 'content': 'title: 基于WWW的交互式网络课件系统的开发技术\nauthors: 傅秀芬，汤庸\nsource: 计算机工程与应用\nsourceDetail: \ndate: 1998.-\ntype: 期刊论文\nkeyword: \nsummary:', 'chunk_order_index': 0, 'full_doc_id': 'doc-c3ddc19663e979c22961236d629d7faa'}, 'chunk-49dcb89e79b16caced258e08371122e5': {'tokens': 71, 'content': 'title: 时态变量“Now”语义及相应时态关系运算\nauthors: 叶小平，汤庸\nsource: 软件学报，2005，16（5）：838-845\nsourceDetail: \ndate: 2005.05\ntype: 期刊论文\nkeyword: \nsummary:', 'chunk_order_index': 0, 'full_doc_id': 'doc-49dcb89e79b16caced258e08371122e5'}, 'chunk-52c7b60eb2f46bca990151794d9339c5': {'tokens': 61, 'content': 'title: 计算机支持的协同工作概观\nauthors: 汤庸\nsource: 工业工程,1999,2(003):10-12\nsourceDetail: \ndate: 1999.01\ntype: 期刊论文\nkeyword: \nsummary:', 'chunk_order_index': 0, 'full_doc_id': 'doc-52c7b60eb2f46bca990151794d9339c5'}, 'chunk-73f68d1cc6f8d7d1f23e

⠙ Processed 1 chunks, 7 entities(duplicated), 6 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠹ Processed 2 chunks, 15 entities(duplicated), 13 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠸ Processed 3 chunks, 23 entities(duplicated), 20 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠼ Processed 4 chunks, 33 entities(duplicated), 29 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠴ Processed 5 chunks, 47 entities(duplicated), 41 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠦ Processed 6 chunks, 58 entities(duplicated), 50 relations(duplicated)

DEBUG:lightrag:Node 傅秀芬 is new to graph.
DEBUG:lightrag:Original description tokens: 28
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 傅秀芬: 傅秀芬是该论文的作者之一，专注于基于WWW的交互式网络课件系统的技术开发。
DEBUG:lightrag:Upserted node 傅秀芬 into knowledge graph.
DEBUG:lightrag:Node 汤庸 is new to graph.
DEBUG:lightrag:Original description tokens: 156
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 汤庸: 汤庸是论文《基于描述逻辑的CIM模型》的作者之一。<SEP>汤庸是该研究论文的另一位作者，与叶小平合作进行了关于时态变量‘Now’的研究。<SEP>汤庸是该论文的作者之一，专注于时态知识和时态数据的统一模型研究。<SEP>汤庸是该论文的作者，研究CD—ROM信息存储与检索技术。<SEP>汤庸是该论文的另一位作者，与傅秀芬合作研究基于WWW的交互式网络课件系统的技术开发。<SEP>汤庸是这篇关于计算机支持的协同工作的论文的作者。
DEBUG:lightrag:Upserted node 汤庸 into knowledge graph.
DEBUG:lightrag:Node 计算机工程与应用 is new to graph.
DEBUG:lightrag:Original description tokens: 57
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for 计算机工程与应用: 《计算机工程与应用》不仅是一本期刊，也是一个组织，负责出版和推广计算机科学领域的




DEBUG:lightrag:Final description for 信息存储与检索技术: 信息存储与检索技术是指用于组织、存储和访问大量信息的方法和技术，是论文《CD—ROM信息存储与检索技术》的主要研究对象。
DEBUG:lightrag:Upserted node 信息存储与检索技术 into knowledge graph.
DEBUG:lightrag:Node CD—ROM is new to graph.
DEBUG:lightrag:Original description tokens: 37
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for CD—ROM: CD—ROM是一种光盘存储介质，用于长期存储大量数据，是论文《CD—ROM信息存储与检索技术》讨论的技术之一。
DEBUG:lightrag:Upserted node CD—ROM into knowledge graph.
DEBUG:lightrag:Edge from 傅秀芬 to 基于WWW的交互式网络课件系统的开发技术 is new to graph.
DEBUG:lightrag:Original description tokens: 28
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for edge 傅秀芬->基于WWW的交互式网络课件系统的开发技术: 傅秀芬是该论文的主要贡献者之一，参与了基于WWW的交互式网络课件系统的研究。
DEBUG:lightrag:Upserted edge from 傅秀芬 to 基于WWW的交互式网络课件系统的开发技术 into knowledge graph.
DEBUG:lightrag:Edge from 基于WWW的交互式网络课件系统的开发技术 to 汤庸 is new to graph.
DEBUG:lightrag:Original description tokens: 17
DEBUG:lightrag:Descript

### 可视化本地图为网页

In [None]:
import os
import random

import networkx as nx
from pyvis.network import Network

# Read the graph from, and write the HTML page to, this notebook's WORKING_DIR,
# the same directory the Neo4j export cell uses, instead of a folder belonging
# to a different experiment.
graphml_path = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
html_path = os.path.join(WORKING_DIR, "knowledge_graph.html")

# Fixed seed so node colours are the same on every run.
COLOR_SEED = 42

# Load the GraphML file
G = nx.read_graphml(graphml_path)

# Create a Pyvis network
net = Network(height="100vh", notebook=True)

# Convert NetworkX graph to Pyvis network
net.from_nx(G)

# Add colors to nodes, drawn from a dedicated RNG so the global `random` state
# is left untouched.
color_rng = random.Random(COLOR_SEED)
for node in net.nodes:
    node["color"] = "#{:06x}".format(color_rng.randint(0, 0xFFFFFF))

# Save and display the network
net.show(html_path)


### 将本地图导入 Neo4j 并可视化

In [None]:
import os
import json
from lightrag.utils import xml_to_json
from neo4j import GraphDatabase

# Number of records sent to Neo4j per query execution.
BATCH_SIZE_NODES = 500
BATCH_SIZE_EDGES = 100

# Neo4j connection settings. Each value can be overridden with an environment
# variable of the same name, so real credentials never have to be written into
# the notebook; the fallbacks are local-development defaults only.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "12345678")


def convert_xml_to_json(xml_path, output_path):
    """Convert the XML file at ``xml_path`` to JSON and write it to ``output_path``.

    Returns the converted data. Returns ``None`` after printing a message when
    the input file does not exist or ``xml_to_json`` yields a falsy result; in
    both cases nothing is written.
    """
    # Guard: nothing to convert without an input file.
    if not os.path.exists(xml_path):
        print(f"Error: File not found - {xml_path}")
        return None

    converted = xml_to_json(xml_path)

    # Guard: a falsy result from the converter counts as failure.
    if not converted:
        print("Failed to create JSON data")
        return None

    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(converted, out_file, ensure_ascii=False, indent=2)
    print(f"JSON file created: {output_path}")
    return converted


def process_in_batches(tx, query, data, batch_size, param_name=None):
    """Run ``query`` once per slice of ``data``, each slice holding at most ``batch_size`` items.

    Args:
        tx: Object exposing ``run(query, parameters)``; in this notebook it is
            the transaction passed in by ``session.execute_write``.
        query: Cypher text that consumes the batch through a query parameter.
        data: Sequence of records to send.
        batch_size: Maximum number of records per ``tx.run`` call.
        param_name: Name of the query parameter that receives each batch. When
            omitted it is inferred: ``"nodes"`` if the query references
            ``$nodes``, otherwise ``"edges"``.
    """
    if param_name is None:
        # Look for the parameter placeholder, not the bare word: an edge query
        # that merely mentions "nodes" (in a comment, alias, ...) must still
        # receive its batch as $edges.
        param_name = "nodes" if "$nodes" in query else "edges"

    for start in range(0, len(data), batch_size):
        tx.run(query, {param_name: data[start : start + batch_size]})


def main():
    """Export the GraphML graph in ``WORKING_DIR`` to JSON and load it into Neo4j.

    Steps: convert ``graph_chunk_entity_relation.graphml`` to
    ``graph_data.json``, write the nodes in batches, write the relationships in
    batches, then set ``displayName`` and labels. Returns early when the
    conversion yields ``None``. Any exception raised while talking to Neo4j is
    printed rather than re-raised, and the driver is always closed.
    """
    # --- Cypher statements ---------------------------------------------------

    # Merge one node per record, copy its properties, then swap the temporary
    # `Entity` label for a label taken from `entity_type` (requires APOC).
    # NOTE(review): because `Entity` is removed again, a second run's MERGE on
    # (:Entity {id}) may not match the existing nodes — confirm before re-running.
    node_upsert_cypher = """
    UNWIND $nodes AS node
    MERGE (e:Entity {id: node.id})
    SET e.entity_type = node.entity_type,
        e.description = node.description,
        e.source_id = node.source_id,
        e.displayName = node.id
    REMOVE e:Entity
    WITH e, node
    CALL apoc.create.addLabels(e, [node.entity_type]) YIELD node AS labeledNode
    RETURN count(*)
    """

    # Create one relationship per record; its type is a known keyword found in
    # `edge.keywords`, or otherwise the first comma-separated keyword with
    # double quotes stripped.
    edge_create_cypher = """
    UNWIND $edges AS edge
    MATCH (source {id: edge.source})
    MATCH (target {id: edge.target})
    WITH source, target, edge,
         CASE
            WHEN edge.keywords CONTAINS 'lead' THEN 'lead'
            WHEN edge.keywords CONTAINS 'participate' THEN 'participate'
            WHEN edge.keywords CONTAINS 'uses' THEN 'uses'
            WHEN edge.keywords CONTAINS 'located' THEN 'located'
            WHEN edge.keywords CONTAINS 'occurs' THEN 'occurs'
           ELSE REPLACE(SPLIT(edge.keywords, ',')[0], '\"', '')
         END AS relType
    CALL apoc.create.relationship(source, relType, {
      weight: edge.weight,
      description: edge.description,
      keywords: edge.keywords,
      source_id: edge.source_id
    }, target) YIELD rel
    RETURN count(*)
    """

    # Final pass over the nodes: displayName = id, labels = [entity_type].
    # NOTE(review): `MATCH (n)` has no filter, so this touches every node in the
    # database, not only the ones written above.
    relabel_cypher = """
    MATCH (n)
    SET n.displayName = n.id
    WITH n
    CALL apoc.create.setLabels(n, [n.entity_type]) YIELD node
    RETURN count(*)
    """

    # --- GraphML -> JSON -----------------------------------------------------
    graphml_path = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
    graph_json_path = os.path.join(WORKING_DIR, "graph_data.json")

    graph = convert_xml_to_json(graphml_path, graph_json_path)
    if graph is None:
        return

    node_records = graph.get("nodes", [])
    edge_records = graph.get("edges", [])

    # --- JSON -> Neo4j -------------------------------------------------------
    neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

    try:
        with neo4j_driver.session() as session:
            # Nodes must exist before the relationships that MATCH them.
            batch_jobs = (
                (node_upsert_cypher, node_records, BATCH_SIZE_NODES),
                (edge_create_cypher, edge_records, BATCH_SIZE_EDGES),
            )
            for cypher, records, batch_size in batch_jobs:
                session.execute_write(process_in_batches, cypher, records, batch_size)

            session.run(relabel_cypher)

    except Exception as exc:
        print(f"Error occurred: {exc}")

    finally:
        neo4j_driver.close()


# Run the GraphML -> JSON -> Neo4j import defined above.
main()


## 问答

In [12]:
res = await rag.aquery(
        "汤庸发表过哪些论文？",
        param=QueryParam(mode="hybrid", only_need_context=True),
    )

print(len(res))
print(res)

INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
DEBUG:lightrag:local query 从实体数据库中获取的信息 [{'__id__': 'ent-0585ed0d327488153dcbb30d90d15301', 'entity_name': '研究成果', '__metrics__': 0.638345259133726, 'id': 'ent-0585ed0d327488153dcbb30d90d15301', 'distance': 0.638345259133726}, {'__id__': 'ent-f5dae3b54d87d6600c53c4612e77face', 'entity_name': '学术期刊', '__metrics__': 0.6285655485134733, 'id': 'ent-f5dae3b54d87d6600c53c4612e77face', 'distance': 0.6285655485134733}, {'__id__': 'ent-81cb1ca24a387a41897034e87b2cbf10', 'entity_name': '14(S),74-79', '__metrics__': 0.6170613033765846, 'id': 'ent-81cb1ca24a387a41897034e87b2cbf10', 'distance': 0.6170613033765846}, {'__id__': 'ent-6a4511b2a6364af279ad59862bd1b060', 'entity_name': '计算机科学领域', '__metrics__': 0.608059449569585, 'id': 'ent-6a4511b2a6364af279ad59862bd1b060', 'distance': 0.608059449569585}, {'__id__': 'ent-a033fbb8eaeb3993ab5d095f75881bf1', 'entity_name': '2005.05', '__metrics__': 0.604942668864432, 'id'

7759

-----Entities-----
```csv
id,	entity,	type,	description,	rank
1,	学术传播,CONCEPT,学术传播是指通过出版物、会议等方式分享研究成果的过程，《软件学报》促进了这一过程。,1
2,	基于WWW的交互式网络课件系统的开发技术,PAPER,这是一篇探讨基于WWW技术开发交互式网络课件系统的学术论文。,5
3,	CD—ROM信息存储与检索技术,PAPER,这篇论文探讨了CD—ROM的信息存储与检索技术，是汤庸的研究成果。,3
4,	1996.01,DATE,这是论文发表的年月，提供了研究的时间框架。,1
5,	时态逻辑,CONCEPT,时态逻辑是研究时间维度上命题真值变化的逻辑体系，本文研究的时态变量‘Now’是其一部分。,1
6,	1999.01,DATE,1999年1月是论文的发表日期。,1
7,	时态知识和时态数据的统一模型研究,PAPER,这是由汤庸等人撰写的一篇期刊论文，探讨了时态知识和时态数据的统一模型。,9
8,	14(S),74-79,TYPE,这是论文在《软件学报》中发表的具体卷号、期号和页码范围，提供了文献引用的详细信息。,1
9,	2(003):10-12,DATE,这是论文在《工业工程》期刊上的卷期和页码，提供了论文的具体位置信息。,1
10,	周生明,PERSON,周生明是论文《基于描述逻辑的CIM模型》的作者之一。,1
11,	中国,LOCATION,中国是《计算机工程与应用》期刊的发行地，也是傅秀芬和汤庸的研究所在地。<SEP>中国是《软件学报》的发行国家，也是叶小平和汤庸进行研究的所在地。,2
12,	1998.,DATE,1998年是该论文发表的时间，标志着研究的时间点。,1
13,	冯智圣,PERSON,冯智圣是该论文的作者之一，参与了时态知识和时态数据的研究工作。,1
14,	1996年 第01期,DATE,这是论文发表的具体日期，表明研究的时间背景。,2
15,	时态变量‘NOW’语义及相应时态关系运算,PAPER,这是叶小平和汤庸发表的一篇研究论文，探讨了时态变量‘Now’的语义及其在时态关系运算中的应用。,9
16,	傅秀芬,PERSON,傅秀芬是该论文的作者之一，专注于基于WWW的交互式网络课件系统的技术开发。,1
17,	计算机时代,JOURNA

In [None]:
# NOTE(review): presumably removes the entity "汤庸" (and possibly its relations)
# from the stored graph, which would change later query results — confirm
# before running.
await rag.adelete_by_entity("汤庸")