In [1]:
import os
import logging
import sys
import nest_asyncio
import json

In [2]:
cd ..

/home/tsunn/Workspace/iai-lab/sosci/codes/Graph-RAG


In [3]:
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    Settings,
    Document, get_response_synthesizer,
    PropertyGraphIndex,
    StorageContext
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    DynamicLLMPathExtractor,
    SchemaLLMPathExtractor
)

from core.data.processing import process_jsonl_data
from embeddings.LocalEmbedding import LocalEmbedding
from llm.TogetherLLM import TogetherLLM
from llm.Mistral import Mistral

  from .autonotebook import tqdm as notebook_tqdm


INFO:datasets:PyTorch version 2.5.1 available.


In [4]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("llama_index").setLevel(logging.INFO)
nest_asyncio.apply()

In [5]:
data_path = "core/data/annotations.jsonl"
llm = Mistral()
splitter = SentenceSplitter()
embedder = LocalEmbedding()

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 26886.56it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 21399.51it/s]


Mistral LLM initialized.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Alibaba-NLP/gte-large-en-v1.5
Loaded model: Alibaba-NLP/gte-large-en-v1.5


In [6]:
try:
    chunks = process_jsonl_data(data_path, mode="document")
except Exception as e:
    print(f"Error processing data: {e}")
    sys.exit(1)

Loaded 74669 documents


Parsing nodes: 100%|██████████| 74669/74669 [00:15<00:00, 4969.91it/s]


Finish splitting 77319 chunks


Processing chunks: 100%|██████████| 77319/77319 [00:00<00:00, 243557.24it/s]

Processed 77319 chunks





In [7]:
_chunks = [chunks[0]] # for testing
len(_chunks)

1

In [8]:
# TOD): Reimplement chunking 
_chunks = [Document.from_dict(chunk) for chunk in _chunks]
tmp = []
for chunk in _chunks:
    temp = splitter._get_splits_by_fns(chunk.text)[0]
    _tmp = []
    res = ""
    for t in temp:
        if len(res) > 32:
            _tmp.append(Document(text=res))
            res = ""
        res += t
    if len(res) > 0:
        _tmp.append(Document(text=res))
    
    # Do batch embedding
    embs = embedder.get_text_embedding_batch(
        texts = [doc.text for doc in _tmp],
        show_progress = True
    )
    for i, doc in enumerate(_tmp):
        doc.embedding = embs[i]
    tmp.extend(_tmp)

_chunks = tmp

len(_chunks)

Batches: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it]?it/s]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.47s/it]
Batches: 100%|██████████| 1/1 [00:12<00:00, 12.14s/it]
Generating embeddings: 100%|██████████| 3/3 [00:22<00:00,  7.62s/it]


3

In [9]:
_chunks

[Document(id_='66681ab6-3b48-4764-b200-7f491a6810ba', embedding=[-0.060546875, -0.498046875, -0.6953125, 0.474609375, 0.396484375, -0.12451171875, -0.64453125, 0.6875, -0.53515625, -0.59765625, 1.0, 0.047607421875, -0.58984375, -0.064453125, -1.0078125, 0.134765625, -0.65625, 0.1865234375, -0.359375, -0.796875, -0.1796875, -0.625, 0.80859375, -0.41796875, 0.66015625, -0.205078125, -0.1435546875, 0.67578125, -0.298828125, 0.88671875, 0.33984375, -1.09375, 1.734375, 0.2216796875, 0.1611328125, 0.640625, 0.2275390625, 1.8125, -0.609375, 0.9453125, -0.91015625, -0.302734375, 0.42578125, -0.0751953125, 0.12060546875, 0.146484375, -0.1806640625, -1.0859375, -0.73046875, 0.16796875, -2.140625, 0.8046875, -0.2578125, 0.58984375, 0.48046875, 0.2080078125, -0.64453125, -0.357421875, -1.1875, 0.109375, -0.58984375, 0.5, -0.423828125, -0.890625, -1.3125, -0.1298828125, -0.474609375, -0.255859375, -1.4296875, 0.53125, -0.78125, 0.5078125, -0.2578125, -1.140625, 1.5703125, -0.91796875, -0.37109375, 

In [10]:
kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=5,
    num_workers=8,
)

In [11]:
graph_store = Neo4jPropertyGraphStore(
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    url=os.environ["NEO4J_URI"],
    database="neo4j"
)

In [12]:
storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [13]:
index = PropertyGraphIndex.from_documents(
    documents=[Document.from_dict(chunk) for chunk in _chunks],
    llm=llm,
    kg_extractor=kg_extractor,
    embed_model=embedder,
    storage_context=storage_context,
    include_embeddings=False,
    max_triplets_per_chunk=5,
    show_progress=True
)

Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 3100.00it/s]
Extracting paths from text: 100%|██████████| 3/3 [00:45<00:00, 15.15s/it]
Extracting implicit paths: 100%|██████████| 3/3 [00:00<00:00, 57719.78it/s]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.55s/it]?it/s]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.89s/it]
Batches: 100%|██████████| 1/1 [00:12<00:00, 12.61s/it]
Generating embeddings: 100%|██████████| 1/1 [00:24<00:00, 24.06s/it]
Batches: 100%|██████████| 1/1 [00:09<00:00,  9.45s/it]?it/s]
Batches: 100%|██████████| 1/1 [00:12<00:00, 12.82s/it]
Batches: 100%|██████████| 1/1 [00:08<00:00,  8.81s/it]
Batches: 100%|██████████| 1/1 [00:09<00:00,  9.94s/it]
Batches: 100%|██████████| 1/1 [00:08<00:00,  8.88s/it]
Batches: 100%|██████████| 1/1 [00:09<00:00,  9.36s/it]
Batches: 100%|██████████| 1/1 [00:09<00:00,  9.11s/it]
Batches: 100%|██████████| 1/1 [00:08<00:00,  8.95s/it]
Batches: 100%|██████████| 1/1 [00:09<00:00,  9.02s/it]
Batches: 100%|██████████| 1/1 [00:09<00:00,  9

In [14]:
query_engine = index.as_query_engine(
    llm = llm,
    include_text=False, response_mode="tree_summarize"
)

In [15]:
response = query_engine.query("How to treat my cat?")

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.85s/it]


In [16]:
response

Response(response="To add some good answers, it's important to consider that wetting your fingers can help calm your cat if they are agitated or stressed. It's also important to ensure that the water is clean and not too hot or cold. Additionally, it may be helpful to speak in a calm and soothing voice when interacting with your cat.", source_nodes=[NodeWithScore(node=TextNode(id_='a06f3ee9-3d34-45bd-9ccf-b59fa0abb332', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0efcb2ac-0d48-48d4-9bbe-70748a7c3f38', node_type=None, metadata={}, hash=None)}, metadata_template='{key}: {value}', metadata_separator='\n', text='To -> Add -> Some good answers', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.3419897591376636), NodeWithScore(node=TextNode(id_='970a6ccb-7b47-457f-b27c-7e6111a1bf55

In [21]:
query_engine = index.as_query_engine(
    llm = llm,
    # embed_model = embedder,
    include_text=False,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=3
)

In [22]:
response = query_engine.query(
    "Effort"
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]


In [23]:
response

Response(response=' Based on the context information provided, it is not clear what is being discussed. There appears to be an instance where a group of people (They) are scratching something, and the speaker (I) wants them to stop. The speaker also mentions saying "no" in a stern voice. Additionally, there is a mention of wet and some good answers.\n\nTo answer your question, Effort is not explicitly related to the context information provided. Could you please provide more specific or clarifying information for me to assist you better?', source_nodes=[NodeWithScore(node=TextNode(id_='10ab834c-ff04-4b74-9d7f-de4a81149666', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='bfb829ea-5316-408e-9094-c367d176056a', node_type=None, metadata={}, hash=None)}, metadata_template='{key}: {value}', metadata_separator='\n', text="They -> Are scratching something -> I don't want them t

In [24]:
nodes = index.as_retriever(include_text=False)
res = nodes.retrieve("Cat")
for r in res:
    print(r.text)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.13it/s]

Wetting -> On -> Fingers
Water -> On -> Fingers
I -> Wet -> Fingers
Wetting -> My -> Fingers
I -> Say "no -> In a stern voice
They -> Are scratching something -> I don't want them to
Wet -> Some -> Good answers



