In [1]:
import os
import logging
import sys
import nest_asyncio
import json

In [2]:
cd ..

/home/tsunn/Workspace/iai-lab/sosci/codes/Graph-RAG


In [3]:
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    Settings,
    Document, get_response_synthesizer,
    PropertyGraphIndex,
    StorageContext
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.graph_stores.neo4j import Neo4jGraphStore, Neo4jPropertyGraphStore

from core.data.processing import process_jsonl_data
from embeddings.LocalEmbedding import LocalEmbedding
from llm.TogetherLLM import TogetherLLM

  from .autonotebook import tqdm as notebook_tqdm


INFO:datasets:PyTorch version 2.5.1 available.


In [4]:
# Load variables from a .env file (if one is found) into os.environ.
# `load_dotenv` was imported but never called, while a later cell reads
# NEO4J_USERNAME / NEO4J_PASSWORD / NEO4J_URI from os.environ. By default
# load_dotenv() does not override variables that are already set.
load_dotenv()

# Send log records to stdout so they show up in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Only warnings and above from the httpx logger; INFO and above from llama_index.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("llama_index").setLevel(logging.INFO)

# Patch asyncio to permit nested event loops (the notebook kernel already runs one).
nest_asyncio.apply()

In [5]:
# Small set of English articles, conjunctions and prepositions.
stopwords = {
    "the", "a", "an",
    "and", "or", "but", "as",
    "of", "at", "by", "for", "with", "about",
    "to", "into", "onto", "upon",
}

In [6]:
# Relative path of the JSONL annotation file passed to process_jsonl_data below.
# NOTE(review): presumably resolved against the working directory set by the
# earlier `cd ..` cell — confirm.
data_path = "core/data/annotations.jsonl"

In [7]:
# Read the annotation file and turn it into `chunks` for the index build.
try:
    chunks = process_jsonl_data(data_path, mode="document")
except Exception as e:
    print(f"Error processing data: {e}")
    # Re-raise rather than calling sys.exit(1): inside a kernel sys.exit only
    # surfaces as SystemExit and hides the original traceback. Re-raising still
    # stops the run, but keeps the real exception type and stack for debugging.
    raise

Loaded 74669 documents


Parsing nodes: 100%|██████████| 74669/74669 [00:13<00:00, 5420.98it/s]


Finish splitting 77319 chunks


Processing chunks: 100%|██████████| 77319/77319 [00:03<00:00, 21471.11it/s]

Processed 77319 chunks





In [None]:
# Embedding model, constructed with default arguments; it is passed as
# `embed_model` when the index is built further down.
embedder = LocalEmbedding()

In [8]:
# LLM used for the index build and by both query engines below.
# This assignment was commented out, so a fresh Restart & Run All failed with
# NameError at the first use of `llm`.
# NOTE(review): TogetherLLM is project-local; presumably it needs an API key
# from the environment — confirm.
llm = TogetherLLM()

In [None]:
graph_store = Neo4jGraphStore(
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    url=os.environ["NEO4J_URI"]
)

In [10]:
storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [None]:
# Build the property-graph index from the processed chunks, using the LLM and
# embedding model created above and the prepared storage context.
# NOTE(review): `include_embeddings` and `max_triplets_per_chunk` look like
# KnowledgeGraphIndex options — confirm that PropertyGraphIndex honours them
# rather than swallowing them as extra keyword arguments.
# NOTE(review): `chunks` is passed as `documents=`; verify that what
# process_jsonl_data returns is what from_documents expects.
index = PropertyGraphIndex.from_documents(
    documents=chunks,
    storage_context=storage_context,
    llm=llm,
    embed_model=embedder,
    max_triplets_per_chunk=10,
    include_embeddings=True,
    show_progress=True,
)

In [13]:
# First query engine over the index: tree-summarize synthesis with
# include_text turned off.
query_engine = index.as_query_engine(
    llm=llm,
    response_mode="tree_summarize",
    include_text=False,
)

In [None]:
# Run a sample question through the current query engine and keep the result.
response = query_engine.query("Tell me more about Interleaf")

In [None]:
# Bare expression: shows the latest query result as the cell output.
response

In [None]:
# Rebind `query_engine` to a second configuration that additionally requests
# embedding_mode="hybrid" and similarity_top_k=5. No embed_model override is
# passed here.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=5,
    embedding_mode="hybrid",
    response_mode="tree_summarize",
    include_text=False,
)

In [None]:
# Single-keyword query against the engine configured in the previous cell.
response = query_engine.query("Effort")

In [None]:
# Bare expression: shows the latest query result as the cell output.
response

In [None]:
# Source nodes attached to the most recent response.
nodes = response.source_nodes

# Report, per source node (numbered from 1), the size of its embedding vector,
# or note that no embedding is attached.
for i, node in enumerate(nodes):
    vector = node.node.embedding
    if vector is None:
        print(f"Node {i + 1}: Embedding is None")
        continue
    embedding_length = len(vector)
    print(f"Node {i + 1}: Embedding Length = {embedding_length}")