In [2]:
import os
from lightrag import LightRAG, QueryParam
from lightrag.llm import openai_complete_if_cache
from lightrag.llm import ollama_embedding
from lightrag.utils import EmbeddingFunc
from dotenv import load_dotenv
import nest_asyncio 
nest_asyncio.apply() 

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
WORKING_DIR = "../data/test_postgres"

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

In [4]:
async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], **kwargs
) -> str:
    return await openai_complete_if_cache(
        "qwen-plus",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        **kwargs,
    )

In [None]:
rag = LightRAG(
    working_dir=WORKING_DIR,
    workspace="test_postgres_storage",
    # kg="Neo4JStorage",
    llm_model_func=llm_model_func,
    embedding_func=EmbeddingFunc(
        embedding_dim=1024,
        max_token_size=512,
        func=lambda texts: ollama_embedding(
            texts, embed_model="viosay/conan-embedding-v1:latest", host="http://192.168.69.234:11343"
        ),
    ),
    chunk_token_size=500,
    log_level="DEBUG",
    vector_storage="PostgresVectorDBStorage"
)

# rag.vector_db_storage_cls.db = postgresql_db

INFO:lightrag:Logger initialized for working directory: ../data/test_postgres
DEBUG:lightrag:LightRAG init with param:
  working_dir = ../data/test_postgres,
  workspace = test_postgres_storage,
  kv_storage = JsonKVStorage,
  vector_storage = PostgresVectorDBStorage,
  graph_storage = NetworkXStorage,
  log_level = DEBUG,
  chunk_token_size = 500,
  chunk_overlap_token_size = 100,
  tiktoken_model_name = gpt-4o-mini,
  entity_extract_max_gleaning = 1,
  entity_summary_to_max_tokens = 500,
  node_embedding_algorithm = node2vec,
  node2vec_params = {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3},
  embedding_func = {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f1c00c5b740>},
  embedding_batch_num = 32,
  embedding_func_max_async = 16,
  llm_model_func = <function llm_model_func at 0x7f1c00c5b380>,
  llm_model_name = meta-llama/Llama-3.2-1B-Instruct,
  llm_model_max_token_size = 32768,
  llm_m

INFO:lightrag:Tables created successfully
INFO:lightrag:Tables created successfully
INFO:lightrag:Tables created successfully


In [6]:
from langchain_community.document_loaders.csv_loader import CSVLoader

doc_path = "../data/paper/scholat_paper_ed/scholat_paper_ed_001.csv"

loader = CSVLoader(doc_path)
data = loader.load()

data = [d.page_content for d in data]
need_to_insert_data = data[:200]
need_to_insert_data

['title: Bitemporal Extension and Mapping of XML Data Model\nauthors: Na Tang; Yong Tang; MiaoMiao Cai;\nsource: 11th International Conference on Computer Supported Cooperative Work in Design, 2007. CSCWD 2007.\nsourceDetail: \ndate: 2007.05\ntype: 会议论文\nkeyword: \nsummary: ',
 'title: 基于时限的角色访问控制委托模型\nauthors: 道炜 汤庸 冀高峰 杨虹轶\nsource: 计算机科学,2008,35（3）:175-177\nsourceDetail: \ndate: 2008.03\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 基于描述逻辑的带属性依赖时序ER模型\nauthors: 蒋运承\xa0汤庸\xa0王驹\xa0冀高峰\nsource: 计算机研究与发展，2007，44（10）: 1765～1773\nsourceDetail: \ndate: 2007.10\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: ID-Based fair off-lin electronic cash system with multiple banks\nauthors: Chang-ji Wang, Yong Tang, Qing Li\nsource: Journal of Computer science and technology, May 2007, 22(3):487~493\nsourceDetail: \ndate: 2007.03\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 基于WWW的交互式网络课件系统的开发技术\nauthors: 傅秀芬，汤庸\nsource: 计算机工程与应用\nsourceDetail: \ndate: 1998.-\ntype: 期刊论文\nkeyword: \nsummary: ',
 'title: 时

In [7]:
print(rag.entities_vdb)
print(rag.relationships_vdb)
print(rag.chunks_vdb)

EntityStorage(namespace='entities', global_config={'working_dir': '../data/test_postgres', 'workspace': 'test_postgres_storage', 'kv_storage': 'JsonKVStorage', 'vector_storage': 'PostgresVectorDBStorage', 'graph_storage': 'NetworkXStorage', 'log_level': 'DEBUG', 'chunk_token_size': 500, 'chunk_overlap_token_size': 100, 'tiktoken_model_name': 'gpt-4o-mini', 'entity_extract_max_gleaning': 1, 'entity_summary_to_max_tokens': 500, 'node_embedding_algorithm': 'node2vec', 'node2vec_params': {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3}, 'embedding_func': {'embedding_dim': 1024, 'max_token_size': 512, 'func': <function <lambda> at 0x7f1c00c5b740>}, 'embedding_batch_num': 32, 'embedding_func_max_async': 16, 'llm_model_func': <function llm_model_func at 0x7f1c00c5b380>, 'llm_model_name': 'meta-llama/Llama-3.2-1B-Instruct', 'llm_model_max_token_size': 32768, 'llm_model_max_async': 16, 'llm_model_kwargs': {}, 'vector_db_storage_cls_kw

In [8]:
print(rag.vector_storage_cls)

<function PostgresStorageFactory.get_storage_class at 0x7f1c018dd800>


In [9]:
print(rag.entities_vdb.storage.engine.url)

postgresql+asyncpg://postgres:***@localhost:6024/test_postgres


In [10]:
# await rag.entities_vdb.storage.init_tables()

In [None]:
await rag.ainsert(need_to_insert_data)

### 可视化本地图为网页

In [None]:
import networkx as nx
from pyvis.network import Network
import random

# Load the GraphML file
G = nx.read_graphml("../data/test_paper/graph_chunk_entity_relation.graphml")

# Create a Pyvis network
net = Network(height="100vh", notebook=True)

# Convert NetworkX graph to Pyvis network
net.from_nx(G)

# Add colors to nodes
for node in net.nodes:
    node["color"] = "#{:06x}".format(random.randint(0, 0xFFFFFF))

# Save and display the network
net.show("../data/test_paper/knowledge_graph.html")


### 可视化本地图到 neo4j

In [None]:
import os
import json
from lightrag.utils import xml_to_json
from neo4j import GraphDatabase

# Constants
BATCH_SIZE_NODES = 500
BATCH_SIZE_EDGES = 100

# Neo4j connection credentials
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "12345678"


def convert_xml_to_json(xml_path, output_path):
    """Converts XML file to JSON and saves the output."""
    if not os.path.exists(xml_path):
        print(f"Error: File not found - {xml_path}")
        return None

    json_data = xml_to_json(xml_path)
    if json_data:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)
        print(f"JSON file created: {output_path}")
        return json_data
    else:
        print("Failed to create JSON data")
        return None


def process_in_batches(tx, query, data, batch_size):
    """Process data in batches and execute the given query."""
    for i in range(0, len(data), batch_size):
        batch = data[i : i + batch_size]
        tx.run(query, {"nodes": batch} if "nodes" in query else {"edges": batch})


def main():
    # Paths
    xml_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
    json_file = os.path.join(WORKING_DIR, "graph_data.json")

    # Convert XML to JSON
    json_data = convert_xml_to_json(xml_file, json_file)
    if json_data is None:
        return

    # Load nodes and edges
    nodes = json_data.get("nodes", [])
    edges = json_data.get("edges", [])

    # Neo4j queries
    create_nodes_query = """
    UNWIND $nodes AS node
    MERGE (e:Entity {id: node.id})
    SET e.entity_type = node.entity_type,
        e.description = node.description,
        e.source_id = node.source_id,
        e.displayName = node.id
    REMOVE e:Entity
    WITH e, node
    CALL apoc.create.addLabels(e, [node.entity_type]) YIELD node AS labeledNode
    RETURN count(*)
    """

    create_edges_query = """
    UNWIND $edges AS edge
    MATCH (source {id: edge.source})
    MATCH (target {id: edge.target})
    WITH source, target, edge,
         CASE
            WHEN edge.keywords CONTAINS 'lead' THEN 'lead'
            WHEN edge.keywords CONTAINS 'participate' THEN 'participate'
            WHEN edge.keywords CONTAINS 'uses' THEN 'uses'
            WHEN edge.keywords CONTAINS 'located' THEN 'located'
            WHEN edge.keywords CONTAINS 'occurs' THEN 'occurs'
           ELSE REPLACE(SPLIT(edge.keywords, ',')[0], '\"', '')
         END AS relType
    CALL apoc.create.relationship(source, relType, {
      weight: edge.weight,
      description: edge.description,
      keywords: edge.keywords,
      source_id: edge.source_id
    }, target) YIELD rel
    RETURN count(*)
    """

    set_displayname_and_labels_query = """
    MATCH (n)
    SET n.displayName = n.id
    WITH n
    CALL apoc.create.setLabels(n, [n.entity_type]) YIELD node
    RETURN count(*)
    """

    # Create a Neo4j driver
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

    try:
        # Execute queries in batches
        with driver.session() as session:
            # Insert nodes in batches
            session.execute_write(
                process_in_batches, create_nodes_query, nodes, BATCH_SIZE_NODES
            )

            # Insert edges in batches
            session.execute_write(
                process_in_batches, create_edges_query, edges, BATCH_SIZE_EDGES
            )

            # Set displayName and labels
            session.run(set_displayname_and_labels_query)

    except Exception as e:
        print(f"Error occurred: {e}")

    finally:
        driver.close()


main()


## 问答

In [11]:
res = await rag.aquery(
        "汤庸发表过哪些论文？",
        param=QueryParam(mode="hybrid", only_need_context=True),
    )

print(len(res))
print(res)

INFO:httpx:HTTP Request: POST http://192.168.69.234:11343/api/embed "HTTP/1.1 200 OK"
DEBUG:lightrag:local query 从实体数据库中获取的信息 [{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x7f1bfb17f3b0>, 'id': 'ent-d88c33fefb6fb7e8335e1314238f3ea0', 'createtime': datetime.datetime(2024, 11, 19, 9, 57, 40, 637592), 'content': '1996年 第01期这是《计算机时代》发表汤庸论文的具体期数和年份。', 'workspace': 'default', 'entity_name': '1996年 第01期', 'updatetime': None, 'embedding': array([ 0.01010448,  0.01765656, -0.01016603, ...,  0.03873679,
       -0.02908394,  0.06536138], dtype=float32)}, {'_sa_instance_state': <sqlalchemy.orm.state.InstanceState object at 0x7f1bfb17c350>, 'id': 'ent-1aa5bfc7ff8ac28f0bdb47fec2952c53', 'createtime': datetime.datetime(2024, 11, 19, 9, 57, 40, 637592), 'content': 'YONG TANGYong Tang 是一位多产的研究者，在多个领域都有显著的贡献。他的研究范围广泛，涵盖了音频动态缓冲控制算法、XML 关键词搜索、XML 数据模型的时间扩展及映射、时间变量查询 XML 文档、协作学习环境中的活动本体模型、工作流时间性能评估、基于模糊时间工作流网的周转时间估计上限、基于粗糙集理论的遗传算法数据推理、基于身份的公平离线电子现金系统的多银行实现、智能太阳能监测系统、服务组合访问控制模型、自适应 

49518

-----Entities-----
```csv
id,	entity,	type,	description,	rank
1,	CSCWD 2006,EVENT,CSCWD 2006 是一个会议，其中发表了关于基于扩展FTWF-nets的工作流时间性能评估的论文。<SEP>CSCWD 2006 是一个会议，其中发表了关于基于模糊时间工作流网的周转时间估计上限的论文。<SEP>CSCWD 2006 是一个会议，该研究论文在此会议上发表，涉及合作工作流程的研究。<SEP>CSCWD 2006 是一个会议，该论文在此会议上发表，讨论了计算机支持的协同工作领域的进展。<SEP>CSCWD 2006是一个会议，在此会议上发表了论文'Temporal Role Hierarchies'。<SEP>CSCWD 2006是一个会议，该会议论文在此发表，专注于计算机支持的协同工作的发展和应用。,12
2,	RP-TREE,TYPE,RP-Tree是一种高效紧凑的高维点访问方法，旨在提高数据检索效率。,8
3,	科研项目管理系统的设计与实现,PAPER,《科研项目管理系统的设计与实现》是一篇由印鉴、曹王华、杨敏、胡菁合作撰写的期刊论文，探讨了科研项目管理系统的开发。,6
4,	中国中西医结合杂志,JOURNAL,《中国中西医结合杂志》是发表论文《数据挖掘技术在中医证治研究中的应用与展望》的期刊。,1
5,	NA TANG,PERSON,Na Tang 是一篇关于XML数据模型的时间扩展和映射的会议论文的作者之一。<SEP>Na Tang 是一篇关于将双时间 XML 数据模型映射到 XML 文档的会议论文的作者。<SEP>Na Tang 是论文《基于模糊时间工作流网的工作流分析》的作者之一。<SEP>Na Tang 是该研究论文的作者之一，参与了关于社区护理合作的群件可用性因素的研究。<SEP>Na Tang 是该论文的作者之一，为合作软件工程的研究做出了贡献。<SEP>Na Tang是论文'Temporal Role Hierarchies'的作者之一。,12
6,	SCI,AWARD,SCI (Science Citation Index) 是该论文被引用的索引，表明其学术价值。,6
7,	流程增量挖掘中的模型更新方法,PAPER,流程增量挖掘中的模型更新方法是一篇期