In [1]:
import os
import logging
import sys
import nest_asyncio
import json

In [2]:
# Switch to the project root so that the `core`, `embeddings` and `llm`
# packages imported below, and the relative `core/data/...` path, resolve.
# The guard keeps this cell idempotent: the original unguarded `cd ..` climbed
# one directory further on every re-run, breaking all later relative paths.
if not os.path.isdir("core"):
    os.chdir("..")
os.getcwd()

/home/tsunn/Workspace/iai-lab/sosci/codes/Graph-RAG


In [4]:
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    Settings,
    Document, get_response_synthesizer,
    PropertyGraphIndex,
    StorageContext
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    DynamicLLMPathExtractor,
    SchemaLLMPathExtractor
)

from core.data.processing import process_jsonl_data
from embeddings.LocalEmbedding import LocalEmbedding
from llm.TogetherLLM import TogetherLLM
from llm.Mistral import Mistral

  from .autonotebook import tqdm as notebook_tqdm


INFO:datasets:PyTorch version 2.5.1 available.


In [5]:
# Route log records to stdout at INFO, then tune individual loggers:
# httpx is raised to WARNING, llama_index stays at INFO.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
for logger_name, logger_level in (
    ("httpx", logging.WARNING),
    ("llama_index", logging.INFO),
):
    logging.getLogger(logger_name).setLevel(logger_level)

# Patch asyncio via nest_asyncio -- presumably so that async llama_index calls
# can run inside the notebook's already-running event loop (TODO confirm).
nest_asyncio.apply()

In [6]:
# Shared inputs and components reused by the cells below.
# NOTE: the path is relative, so it depends on the current working directory.
data_path = "core/data/annotations.jsonl"
# Project-local LLM wrapper (imported from llm.Mistral).
llm = Mistral()
# llama_index sentence splitter, constructed with its default settings.
splitter = SentenceSplitter()
# Project-local embedding model wrapper (imported from embeddings.LocalEmbedding).
embedder = LocalEmbedding()

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 30615.36it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 18893.26it/s]


Mistral LLM initialized.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Alibaba-NLP/gte-large-en-v1.5
Loaded model: Alibaba-NLP/gte-large-en-v1.5


In [None]:
# Load the annotation records from `data_path` in "document" mode.
try:
    chunks = process_jsonl_data(data_path, mode="document")
except Exception as e:
    print(f"Error processing data: {e}")
    # Re-raise instead of calling sys.exit(1): inside a Jupyter kernel,
    # sys.exit only raises SystemExit, which hides the original traceback
    # while leaving `chunks` undefined for the cells that follow.
    raise

In [None]:
# Work on just the first loaded record while testing the pipeline.
# Raises IndexError if `chunks` is empty.
_chunks = [chunks[0]] # for testing
len(_chunks)

In [None]:
# TODO: Reimplement chunking
# Greedy re-chunking: split each document's text into fragments with the
# splitter, then glue consecutive fragments together. Before each fragment is
# appended, a buffer already longer than 32 characters is emitted as its own
# Document. Every resulting Document gets its embedding attached in one batch.
_chunks = [Document.from_dict(raw_chunk) for raw_chunk in _chunks]
rechunked = []
for source_doc in _chunks:
    # NOTE(review): `_get_splits_by_fns` is a private SentenceSplitter method;
    # only the first element of its return value is used here.
    fragments = splitter._get_splits_by_fns(source_doc.text)[0]

    pieces = []
    buffer = ""
    for fragment in fragments:
        if len(buffer) > 32:
            pieces.append(Document(text=buffer))
            buffer = ""
        buffer += fragment
    # Flush whatever is left after the last fragment.
    if buffer:
        pieces.append(Document(text=buffer))

    # Embed all pieces of this document in a single batch call.
    vectors = embedder.get_text_embedding_batch(
        texts=[piece.text for piece in pieces],
        show_progress=True,
    )
    for position in range(len(pieces)):
        pieces[position].embedding = vectors[position]
    rechunked += pieces

_chunks = rechunked

len(_chunks)

In [None]:
# Display the re-chunked Document list for inspection.
# NOTE(review): each Document carries its embedding, so this output can be large.
_chunks

In [10]:
# Extractor handed to the index build below; constructed with default arguments.
kg_extractor = ImplicitPathExtractor()

In [None]:
# Populate os.environ from a local .env file before reading the credentials.
# `load_dotenv` was imported but never called, so the lookups below raised
# KeyError whenever the NEO4J_* variables lived only in .env. By default
# load_dotenv does not override variables that are already set.
load_dotenv()

# Connect to Neo4j using credentials taken from the environment.
# NOTE(review): PropertyGraphIndex appears to expect a property graph store
# (e.g. Neo4jPropertyGraphStore) rather than Neo4jGraphStore -- confirm that
# this store is actually the one being written to.
graph_store = Neo4jGraphStore(
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    url=os.environ["NEO4J_URI"]
)

In [13]:
# Wrap the Neo4j store in a StorageContext for the index build below.
# NOTE(review): this fills the `graph_store` slot; PropertyGraphIndex presumably
# reads `property_graph_store` instead -- verify against the llama_index docs.
storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [None]:
# Build the property graph index from the prepared chunks.
#
# Two fixes relative to the previous version of this cell:
#   * `_chunks` already holds Document objects after the re-chunking cell, so
#     they are passed through as-is; only raw dicts are converted with
#     Document.from_dict (the old code re-converted every item).
#   * PropertyGraphIndex takes `kg_extractors` (a list). The old keyword
#     `kg_extractor=` was swallowed by **kwargs, so the configured extractor
#     was ignored and the library defaults were used instead.
# NOTE(review): `include_embeddings` and `max_triplets_per_chunk` look like
# KnowledgeGraphIndex options; they are kept unchanged here but are presumably
# ignored by PropertyGraphIndex -- confirm and drop if so.
index = PropertyGraphIndex.from_documents(
    documents=[
        chunk if isinstance(chunk, Document) else Document.from_dict(chunk)
        for chunk in _chunks
    ],
    llm=llm,
    kg_extractors=[kg_extractor],
    embed_model=embedder,
    storage_context=storage_context,
    include_embeddings=False,
    max_triplets_per_chunk=5,
    show_progress=True
)

In [15]:
# Build a query engine on top of the index, answering with the same LLM.
# `include_text=False` and `response_mode="tree_summarize"` are forwarded as
# keyword arguments to the index's query-engine factory.
query_engine = index.as_query_engine(
    llm = llm,
    include_text=False, response_mode="tree_summarize"
)

In [None]:
# Smoke-test the query engine with a natural-language question.
response = query_engine.query("How to treat my cat?")

In [None]:
# Display the answer returned for the question above.
response

In [None]:
# Rebuild the query engine (replacing the earlier `query_engine`) with extra
# retrieval options.
# NOTE(review): `embedding_mode` and `similarity_top_k` are forwarded as plain
# keyword arguments; confirm that the property-graph retrievers honour them.
query_engine = index.as_query_engine(
    llm = llm,
    # embed_model = embedder,
    include_text=False,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=5
)

In [None]:
# Query the reconfigured engine with a single keyword instead of a question.
response = query_engine.query(
    "Effort"
)

In [None]:
# Display the answer returned for the keyword query.
response

In [None]:
# Bypass answer synthesis: fetch the raw retrieval results for a keyword and
# print the text of each hit.
retriever = index.as_retriever(include_text=False)
hits = retriever.retrieve("Cat")
for hit in hits:
    print(hit.text)