In [1]:
# Install the LangChain packages; langchain_community supplies the CSVLoader imported further down.
%pip install -q langchain langchain_community

Note: you may need to restart the kernel to use updated packages.


In [19]:
import os
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.llm import openai_complete_if_cache, openai_embedding
from lightrag.llm import ollama_model_complete, ollama_embedding
from lightrag.utils import EmbeddingFunc
import numpy as np
import textract
from dotenv import load_dotenv
import nest_asyncio 
# NOTE(review): presumably needed so asyncio code can run inside the notebook's
# already-running event loop -- confirm against the nest_asyncio documentation.
nest_asyncio.apply() 

# Load environment settings; DASHSCOPE_API_KEY is read later through os.getenv.
load_dotenv()

True

In [20]:
# Directory handed to LightRAG as its working directory; the exported graph
# files are read back from here by the visualisation cells.
WORKING_DIR = "../data/test_paper"

# makedirs (rather than mkdir) also creates missing parent directories such as
# ../data, and exist_ok=True keeps the cell safe to re-run.
os.makedirs(WORKING_DIR, exist_ok=True)

In [21]:
async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Request a completion from the "qwen-plus" model through the DashScope endpoint.

    Args:
        prompt: Prompt text forwarded to the model.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Optional list of earlier messages; ``None`` (the
            default) is treated as an empty history.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``openai_complete_if_cache``.

    Returns:
        The value returned by ``openai_complete_if_cache``.
    """
    # Use a fresh list per call instead of a shared mutable default argument.
    if history_messages is None:
        history_messages = []
    return await openai_complete_if_cache(
        "qwen-plus",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        # The API key comes from the environment rather than from the notebook.
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        **kwargs,
    )

In [22]:
# Embedding wrapper: bundles the Ollama embedding call with the declared
# embedding dimension (1024) and maximum token size (512).
embedding_func = EmbeddingFunc(
    embedding_dim=1024,
    max_token_size=512,
    func=lambda texts: ollama_embedding(
        texts,
        embed_model="viosay/conan-embedding-v1:latest",
        host="http://192.168.69.234:11343",
    ),
)

# LightRAG instance used by the insert and query cells below.
rag = LightRAG(
    working_dir=WORKING_DIR,
    # kg="Neo4JStorage",
    llm_model_func=llm_model_func,
    embedding_func=embedding_func,
    chunk_token_size=500,
    log_level="DEBUG",
)

INFO:lightrag:Logger initialized for working directory: ../data/test_paper
DEBUG:lightrag:LightRAG init with param:
  working_dir = ../data/test_paper,
  kg = NetworkXStorage,
  log_level = DEBUG,
  chunk_token_size = 500,
  chunk_overlap_token_size = 100,
  tiktoken_model_name = gpt-4o-mini,
  entity_extract_max_gleaning = 1,
  entity_summary_to_max_tokens = 500,
  node_embedding_algorithm = node2vec,
  node2vec_params = {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3},
  embedding_func = {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f9378aa3420>},
  embedding_batch_num = 32,
  embedding_func_max_async = 16,
  llm_model_func = <function llm_model_func at 0x7f9378aa34c0>,
  llm_model_name = meta-llama/Llama-3.2-1B-Instruct,
  llm_model_max_token_size = 32768,
  llm_model_max_async = 16,
  llm_model_kwargs = {},
  key_string_value_json_storage_cls = <class 'lightrag.storage.JsonKVStorage'>,
  

In [18]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# CSV file with the paper records to index.
doc_path = "../data/paper/scholat_paper_ed/scholat_paper_ed_001.csv"

# Load the CSV, then keep only the text content of every loaded document.
loader = CSVLoader(doc_path)
documents = loader.load()
data = [document.page_content for document in documents]

# Preview the five records (indices 4-8) that are inserted in the next cell.
data[4:9]

['title: 基于WWW的交互式网络课件系统的开发技术\nauthors: 傅秀芬，汤庸\nsource: 计算机工程与应用\nsourceDetail: \ndate: 1998.-\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 时态变量“Now”语义及相应时态关系运算\nauthors: 叶小平，汤庸\nsource: 软件学报，2005，16（5）：838-845\nsourceDetail: \ndate: 2005.05\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 计算机支持的协同工作概观\nauthors: 汤庸\nsource: 工业工程,1999,2(003):10-12\nsourceDetail: \ndate: 1999.01\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 时态知识和时态数据的统一模型研究\nauthors: 汤庸，汤娜，叶小平，冯智圣，肖炜\nsource: 软件学报，2003，14(S),74-79 【EI】\nsourceDetail: \ndate: 2003.11\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 基于描述逻辑的CIM模型\nauthors: 蒋运承 汤庸 王驹 周生明\nsource: 微电子学与计算机,2007,24(012):55-58\nsourceDetail: \ndate: 2007.-\ntype: 期刊论文\nkeyword: \nsummary: ']

In [23]:
await rag.ainsert(data[4:9])

INFO:lightrag:[New Docs] inserting 5 docs
INFO:lightrag:[New Chunks] inserting 5 chunks
INFO:lightrag:Inserting 5 vectors to chunks
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
INFO:lightrag:[Entity Extraction]...
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs

⠙ Processed 1 chunks, 8 entities(duplicated), 7 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠹ Processed 2 chunks, 18 entities(duplicated), 16 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠸ Processed 3 chunks, 27 entities(duplicated), 25 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠼ Processed 4 chunks, 33 entities(duplicated), 33 relations(duplicated)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


⠴ Processed 5 chunks, 42 entities(duplicated), 41 relations(duplicated)

DEBUG:lightrag:Node "傅秀芬" is new to graph.
DEBUG:lightrag:Original description tokens: 23
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for "傅秀芬": "傅秀芬 is one of the authors of a paper on the development of interactive web-based courseware systems."
DEBUG:lightrag:Upserted node "傅秀芬" into knowledge graph.
DEBUG:lightrag:Node "汤庸" already exists in graph.
DEBUG:lightrag:Original description tokens: 211
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for "汤庸": "Tang Yong is one of the authors of the paper '时态知识和时态数据的统一模型研究'."<SEP>"The author of the paper 'An Overview of Computer-Supported Collaborative Work', published in 1999."<SEP>"汤庸 is one of the authors of a paper on the development of interactive web-based courseware systems."<SEP>"汤庸 is one of the authors of the paper '基于描述逻辑的CIM模型', published in 微电子学与计算机 in 2007."<SEP>"汤庸 is one of the authors of the paper '基于描述逻辑的带属性依赖时序ER模型',




DEBUG:lightrag:Upserted edge from "10-12" to "计算机支持的协同工作概观" into knowledge graph.
DEBUG:lightrag:Edge from "1999.01" to "计算机支持的协同工作概观" is new to graph.
DEBUG:lightrag:Original description tokens: 23
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for edge "1999.01"->"计算机支持的协同工作概观": "The paper 'An Overview of Computer-Supported Collaborative Work' was published on the date 1999.01."
DEBUG:lightrag:Upserted edge from "1999.01" to "计算机支持的协同工作概观" into knowledge graph.
DEBUG:lightrag:Edge from "期刊论文" to "计算机支持的协同工作概观" is new to graph.
DEBUG:lightrag:Original description tokens: 19
DEBUG:lightrag:Description is short enough, no need to summarize.
DEBUG:lightrag:Final description for edge "期刊论文"->"计算机支持的协同工作概观": "The paper 'An Overview of Computer-Supported Collaborative Work' is classified as a journal paper."
DEBUG:lightrag:Upserted edge from "期刊论文" to "计算机支持的协同工作概观" into knowledge graph.
DEBUG:lightrag:Edge from "时态知识和时态数据的统一模型研究" to "汤庸" 

### 将本地图可视化为网页

In [25]:
import networkx as nx
from pyvis.network import Network
import random

# Fixed seed so the node colours are identical on every run of this cell.
COLOR_SEED = 42

# Load the GraphML file from the shared working directory. The path is built
# from WORKING_DIR (as the Neo4j export does) instead of repeating the literal.
graph_path = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
G = nx.read_graphml(graph_path)

# Create a Pyvis network and copy the NetworkX graph into it
net = Network(height="100vh", notebook=True)
net.from_nx(G)

# Give every node a pseudo-random RGB colour; a local RNG is used so the
# global `random` state is left untouched.
color_rng = random.Random(COLOR_SEED)
for node in net.nodes:
    node["color"] = "#{:06x}".format(color_rng.randint(0, 0xFFFFFF))

# Save and display the network
net.show(os.path.join(WORKING_DIR, "knowledge_graph.html"))


../data/test_paper/knowledge_graph.html


### 将本地图导入 Neo4j 进行可视化

In [26]:
import os
import json
from lightrag.utils import xml_to_json
from neo4j import GraphDatabase

# Constants
# Maximum number of nodes / edges sent to Neo4j per query execution.
BATCH_SIZE_NODES = 500
BATCH_SIZE_EDGES = 100

# Neo4j connection settings. Each value can be overridden through an
# environment variable of the same name, so real credentials need not be
# stored in the notebook; the literals are only fallbacks for a local instance.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "12345678")


def convert_xml_to_json(xml_path, output_path):
    """Convert the XML file at ``xml_path`` to JSON and write it to ``output_path``.

    Returns the converted data. Returns ``None`` (after printing a message)
    when the input file does not exist or ``xml_to_json`` gives a falsy result;
    in both cases no output file is written.
    """
    # Guard: there is nothing to convert when the input file is absent.
    if not os.path.exists(xml_path):
        print(f"Error: File not found - {xml_path}")
        return None

    converted = xml_to_json(xml_path)
    if not converted:
        print("Failed to create JSON data")
        return None

    # ensure_ascii=False keeps non-ASCII characters unescaped in the output.
    serialized = json.dumps(converted, ensure_ascii=False, indent=2)
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(serialized)
    print(f"JSON file created: {output_path}")
    return converted


def process_in_batches(tx, query, data, batch_size):
    """Run ``query`` once per consecutive slice of ``data``.

    Each slice holds at most ``batch_size`` items. It is bound to the ``nodes``
    parameter when the query references the ``$nodes`` placeholder and to the
    ``edges`` parameter otherwise.

    Args:
        tx: Object exposing ``run(query, parameters)``.
        query: Query text to execute for every batch.
        data: Sliceable sequence of records.
        batch_size: Maximum number of records per execution; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make the range empty and silently send nothing.
        raise ValueError("batch_size must be a positive integer")

    # Test for the placeholder itself: the bare word "nodes" may also occur
    # elsewhere in a query that actually expects $edges.
    param_name = "nodes" if "$nodes" in query else "edges"
    for start in range(0, len(data), batch_size):
        tx.run(query, {param_name: data[start : start + batch_size]})


def _run_and_consume(tx, query):
    """Run ``query`` inside the given transaction and consume its result."""
    return tx.run(query).consume()


def main():
    """Export the GraphML graph under ``WORKING_DIR`` to JSON and load it into Neo4j.

    Steps:
      1. Convert ``graph_chunk_entity_relation.graphml`` to ``graph_data.json``
         (both under ``WORKING_DIR``); return early if the conversion fails.
      2. Insert the nodes, then the edges, in batches.
      3. Set ``displayName`` and the ``entity_type`` label on the nodes.

    Errors raised while talking to Neo4j are printed rather than propagated;
    the driver is always closed.
    """
    # Paths
    xml_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
    json_file = os.path.join(WORKING_DIR, "graph_data.json")

    # Convert XML to JSON
    json_data = convert_xml_to_json(xml_file, json_file)
    if json_data is None:
        return

    # Load nodes and edges
    nodes = json_data.get("nodes", [])
    edges = json_data.get("edges", [])

    # Neo4j queries
    # Nodes are merged on (:Entity {id}); the :Entity label is then removed and
    # the node's entity_type is added as a label through APOC.
    # NOTE(review): since :Entity is removed again, a second run presumably
    # cannot match the nodes from the first run and would duplicate them -- confirm.
    create_nodes_query = """
    UNWIND $nodes AS node
    MERGE (e:Entity {id: node.id})
    SET e.entity_type = node.entity_type,
        e.description = node.description,
        e.source_id = node.source_id,
        e.displayName = node.id
    REMOVE e:Entity
    WITH e, node
    CALL apoc.create.addLabels(e, [node.entity_type]) YIELD node AS labeledNode
    RETURN count(*)
    """

    # The relationship type is derived from edge.keywords: one of five fixed
    # keywords if present, otherwise the first comma-separated keyword with
    # double quotes stripped.
    create_edges_query = """
    UNWIND $edges AS edge
    MATCH (source {id: edge.source})
    MATCH (target {id: edge.target})
    WITH source, target, edge,
         CASE
            WHEN edge.keywords CONTAINS 'lead' THEN 'lead'
            WHEN edge.keywords CONTAINS 'participate' THEN 'participate'
            WHEN edge.keywords CONTAINS 'uses' THEN 'uses'
            WHEN edge.keywords CONTAINS 'located' THEN 'located'
            WHEN edge.keywords CONTAINS 'occurs' THEN 'occurs'
           ELSE REPLACE(SPLIT(edge.keywords, ',')[0], '\"', '')
         END AS relType
    CALL apoc.create.relationship(source, relType, {
      weight: edge.weight,
      description: edge.description,
      keywords: edge.keywords,
      source_id: edge.source_id
    }, target) YIELD rel
    RETURN count(*)
    """

    # MATCH (n) has no label or property filter, so this statement applies to
    # every node the query can see, not only those inserted above.
    set_displayname_and_labels_query = """
    MATCH (n)
    SET n.displayName = n.id
    WITH n
    CALL apoc.create.setLabels(n, [n.entity_type]) YIELD node
    RETURN count(*)
    """

    # Create a Neo4j driver
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

    try:
        # Execute queries in batches
        with driver.session() as session:
            # Insert nodes in batches
            session.execute_write(
                process_in_batches, create_nodes_query, nodes, BATCH_SIZE_NODES
            )

            # Insert edges in batches
            session.execute_write(
                process_in_batches, create_edges_query, edges, BATCH_SIZE_EDGES
            )

            # Set displayName and labels. Run as a managed write transaction
            # with the result consumed, like the two steps above, so failures
            # surface here instead of when the session is closed.
            session.execute_write(_run_and_consume, set_displayname_and_labels_query)

    except Exception as e:
        # Best-effort cell: report the problem instead of raising.
        print(f"Error occurred: {e}")

    finally:
        driver.close()


# Run the export: convert the GraphML under WORKING_DIR to JSON and load it into Neo4j.
main()


Root element: {http://graphml.graphdrawing.org/xmlns}graphml
Root attributes: {'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd'}
Found 48 nodes and 58 edges
JSON file created: ../data/test_paper/graph_data.json


In [27]:
print(
    await rag.aquery(
        "汤庸发表过哪些论文？",
        param=QueryParam(mode="hybrid"),
    )
)

INFO:httpx:HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
DEBUG:lightrag:local query 从实体数据库中获取的信息 [{'__id__': 'ent-07064580d380246514eec30e9aa69df8', 'entity_name': '"汤庸"', '__metrics__': 0.6949922203642276, 'id': 'ent-07064580d380246514eec30e9aa69df8', 'distance': 0.6949922203642276}, {'__id__': 'ent-2bbfea4f1f8d06fbc7004847db5d5744', 'entity_name': '"王驹"', '__metrics__': 0.562706965841677, 'id': 'ent-2bbfea4f1f8d06fbc7004847db5d5744', 'distance': 0.562706965841677}, {'__id__': 'ent-cf179b5ea5ce2e7222e9bee097e16cce', 'entity_name': '"基于描述逻辑的CIM模型"', '__metrics__': 0.5604659369693408, 'id': 'ent-cf179b5ea5ce2e7222e9bee097e16cce', 'distance': 0.5604659369693408}, {'__id__': 'ent-7e0a59edda8d2cdfb7a2fb5c44dbf716', 'entity_name': '"软件学报"', '__metrics__': 0.5593587167353271, 'id': 'ent-7e0a59edda8d2cdfb7a2fb5c44dbf716', 'distance': 0.559358716735327

汤庸是一位活跃的计算机科学领域的研究人员，发表了多篇具有影响力的论文。以下是汤庸的主要论文列表：

### 1. **计算机支持的协同工作概观**
- **标题**: 计算机支持的协同工作概观
- **作者**: 汤庸
- **来源**: 工业工程, 1999, 2(003): 10-12
- **日期**: 1999.01
- **类型**: 期刊论文

### 2. **基于描述逻辑的CIM模型**
- **标题**: 基于描述逻辑的CIM模型
- **作者**: 蒋运承, 汤庸, 王驹, 周生明
- **来源**: 微电子学与计算机, 2007, 24(012): 55-58
- **日期**: 2007.-
- **类型**: 期刊论文

### 3. **基于描述逻辑的带属性依赖时序ER模型**
- **标题**: 基于描述逻辑的带属性依赖时序ER模型
- **作者**: 蒋运承, 汤庸, 王驹, 冀高峰
- **来源**: 计算机研究与发展, 2007, 44(10): 1765-1773
- **日期**: 2007.10
- **类型**: 期刊论文

### 4. **时态知识和时态数据的统一模型研究**
- **标题**: 时态知识和时态数据的统一模型研究
- **作者**: 汤庸, 汤娜, 叶小平, 冯智圣, 肖炜
- **来源**: 软件学报, 2003, 14(S), 74-79 【EI】
- **日期**: 2003.11
- **类型**: 期刊论文

### 5. **基于时限的角色访问控制委托模型**
- **标题**: 基于时限的角色访问控制委托模型
- **作者**: 道炜, 汤庸, 冀高峰, 杨虹轶
- **来源**: 计算机科学, 2008, 35(3): 175-177
- **日期**: 2008.03
- **类型**: 期刊论文

### 6. **时态变量“Now”语义及相应时态关系运算**
- **标题**: 时态变量“Now”语义及相应时态关系运算
- **作者**: 叶小平, 汤庸
- **来源**: 软件学报, 2005, 16(5): 838-845
- **日期**: 2005.05
- **类型**: 期刊论文

### 7. **基于WWW的交互式网络课件系统的开发技术**
- **标题**: 基于