In [1]:
import os
import logging
import sys
import nest_asyncio
import json

In [2]:
cd ..

d:\Project\Graph-RAG


In [3]:
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    Settings,
    Document, get_response_synthesizer,
    KnowledgeGraphIndex, PropertyGraphIndex,
    StorageContext
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.graph_stores.neo4j import Neo4jGraphStore


from core.data.processing import process_data
from embeddings.LocalEmbedding import LocalEmbedding
from llm.TogetherLLM import TogetherLLM

  from .autonotebook import tqdm as notebook_tqdm


INFO:datasets:PyTorch version 2.5.1+cu118 available.


In [4]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("llama_index").setLevel(logging.INFO)
nest_asyncio.apply()

In [5]:
data_path = "core/data/annotations.jsonl"

In [6]:
try:
    chunks = process_data(data_path)
except Exception as e:
    print(f"Error processing data: {e}")
    sys.exit(1)

Parsing nodes: 100%|██████████| 74669/74669 [00:16<00:00, 4502.95it/s]


Finish processing 77319 chunks


In [7]:
chunks = chunks[0:10]
print(len(chunks))

10


In [8]:
embedder = LocalEmbedding()

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Alibaba-NLP/gte-large-en-v1.5
Loaded model: Alibaba-NLP/gte-large-en-v1.5


In [9]:
llm = TogetherLLM()

Custom LLM initialized with model: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-128K


In [10]:
graph_store = Neo4jGraphStore(
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    url=os.environ["NEO4J_URI"]
)

INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT constraint_1ed05907 FOR (e:Entity) REQUIRE (e.id) IS UNIQUE` already exists.} {position: None} for query: '\n                CREATE CONSTRAINT IF NOT EXISTS FOR (n:Entity) REQUIRE n.id IS UNIQUE;\n                '


In [11]:
storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [13]:
index = KnowledgeGraphIndex(
    nodes=chunks,
    llm=llm,
    embed_model=embedder,
    storage_context=storage_context,
    include_embeddings=True,
    max_triplets_per_chunk=3,
    show_progress=True
)

  index = KnowledgeGraphIndex(
Processing nodes:   0%|          | 0/10 [00:00<?, ?it/s]
[A
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.07it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 71.43it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 76.92it/s]
Generating embeddings: 100%|██████████| 3/3 [00:00<00:00, 10.35it/s]
Processing nodes:  10%|█         | 1/10 [00:01<00:16,  1.84s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.88it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 83.11it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 62.48it/s]
Generating embeddings: 100%|██████████| 3/3 [00:00<00:00, 49.09it/s]
Processing nodes:  20%|██        | 2/10 [00:02<00:09,  1.22s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 65.67it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 66.29it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 67.02it/s]
Generating embeddings: 100%|██████████| 3/3 [00:00<00:00, 50.37it/s]
Processing nodes:  30%|███       | 3/10 [00:03<00:07,  1.07s/it]
Batch

In [15]:
query_engine = index.as_query_engine(
    llm = llm,
    include_text=False, response_mode="tree_summarize"
)

In [16]:
response = query_engine.query("Tell me more about Interleaf")

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.87it/s]


In [18]:
response

Response(response='Unfortunately, there is no information provided about "Interleaf" in the given context. The provided knowledge sequence only contains two examples:\n\n1. (\'Cats\', \'Understand\', \'Something\')\n2. (\'Way\', \'Works for\', \'Us\')\n\nThere is no mention of "Interleaf" in the kg_schema or the knowledge sequence. Therefore, I cannot provide any information about "Interleaf" based on the given context. If you have more information or context about "Interleaf", I would be happy to try and help you further.', source_nodes=[NodeWithScore(node=TextNode(id_='3d4a9967-6172-4f7e-b251-32ab0936a52b', embedding=None, metadata={'kg_rel_texts': ["('Cats', 'Understand', 'Something')", "('Way', 'Works for', 'Us')"], 'kg_rel_map': {}, 'kg_schema': {'schema': "Node properties are the following:\nEntity {id: STRING}\nRelationship properties are the following:\n\nThe relationships are the following:\n(:Entity)-[:WET]->(:Entity),(:Entity)-[:FLICK]->(:Entity),(:Entity)-[:USE]->(:Entity),

In [17]:
query_engine = index.as_query_engine(
    llm = llm,
    embed_model = embedder,
    include_text=False,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=5
)

In [None]:
response = query_engine.query(
    "Effort"
)

In [None]:
response

In [None]:
nodes = response.source_nodes

# Iterate through the nodes to access their embeddings and calculate the length
for i, node in enumerate(nodes):
    if node.node.embedding is not None:
        embedding_length = len(node.node.embedding)
        print(f"Node {i + 1}: Embedding Length = {embedding_length}")
    else:
        print(f"Node {i + 1}: Embedding is None")