## Imports

In [1]:
# Add project root to Python path ONLY FOR NOTEBOOK!!!
import sys
from pathlib import Path

project_root = (
    Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()
)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
from neo4j import GraphDatabase
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from collections import defaultdict
from neo4j_graphrag.retrievers import QdrantNeo4jRetriever
import uuid
import os

from config.datasets import GraphComponents, Single

In [3]:
# Load environment variables
load_dotenv()

True

---

## Setting Up Environment Variables

In [4]:
# Get credentials from environment variables (derived from docker-compose config)
qdrant_port = os.getenv("QDRANT_HTTP_PORT", "6333")
qdrant_host = os.getenv("QDRANT_URL", "http://localhost")
qdrant_url = f"{qdrant_host}:{qdrant_port}"


# Parse NEO4J_AUTH (format: username/password)
neo4j_auth = os.getenv("NEO4J_AUTH", "neo4j/password")
neo4j_username, neo4j_password = neo4j_auth.split("/", 1)
neo4j_bolt_port = os.getenv("NEO4J_BOLT_PORT")
neo4j_url = os.getenv("NEO4J_URL")
neo4j_uri = f"{neo4j_url}:{neo4j_bolt_port}"
collection_name = os.getenv("QDRANT_COLLECTION_NAME")

# LLM:
llm_model = os.getenv("LLM_MODEL")
llm_api_key = os.getenv("LLM_API_KEY")

# Data and Other:
data_folder = os.getenv("RAW_DATA_FOLDER")

In [5]:
# Print out all envs:
print(f"QDRANT PORT: {qdrant_port}")
print(f"QDRANT HOST: {qdrant_host}")
print(f"QDRANT URL: {qdrant_url}")
print(f"Collection Name: {collection_name}")
print(f"NEO4J URI: {neo4j_uri}")
print(f"NEO4J USERNAME: {neo4j_username}")
print(f"NEO4J PASSWORD: {neo4j_password}")
print(f"LLM MODEL: {llm_model}")
print(f"Data Folder: {data_folder}")


QDRANT PORT: 6333
QDRANT HOST: http://localhost
QDRANT URL: http://localhost:6333
Collection Name: QdrantRagCollection
NEO4J URI: bolt://localhost:7687
NEO4J USERNAME: neo4j
NEO4J PASSWORD: password
LLM MODEL: gemini/gemini-2.5-flash
Data Folder: ./raw_data/


---

## 0.Initialization of Neo4j and Qdrant Clients

### Initiate Qdrant and Neo4j 

In [6]:
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

qdrant_client = QdrantClient(
    url=qdrant_url,
)

### Creating Qdrant Collection

* Creating neo4j and qdrant clients
* Creating qdrant collection


In [7]:
from rag_handler.qdrant_orchestrator import QdrantOrchestrator

qdrant_orchestrator = QdrantOrchestrator(
    qdrant_url=qdrant_url, collection_name=collection_name, qdrant_key=None
)

qdrant_orchestrator.create_collection()


Collection 'QdrantRagCollection' not found. Creating it now...
Collection 'QdrantRagCollection' created successfully.


# 1. Ingestion - read files

In [8]:
from ingestion.file_reader import FileReader
from ingestion.chunker_embedder import ChunkerEmbedder

### Reading Files

In [9]:
file_reader = FileReader(folder_path=data_folder)
files = file_reader.read_files()
chunk_embedder = ChunkerEmbedder(
    all_files=files,
    chunk_size=os.getenv("CHUNK_SIZE"),
    chunk_overlap=os.getenv("CHUNK_OVERLAP"),
)


# files = {
#     "pdf": ["/home/nir/projects/graph_rag/raw_data/Nir_Potasman_CV.pdf"],
#     "text": [
#         "/home/nir/projects/graph_rag/raw_data/raw_text.txt",
#         "/home/nir/projects/graph_rag/raw_data/short_story.txt",
#     ],
#     "markdown": ["/home/nir/projects/graph_rag/raw_data/raw_text.md"],
#     "image": [],
# }


### Chunking Files
#### For now - I will only chunk the PDF.

In [10]:
chunked_pdf = chunk_embedder.chunk_pdf()

2025-11-30 13:18:01,724 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-30 13:18:01,790 - INFO - Going to convert document batch...
2025-11-30 13:18:01,791 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-30 13:18:01,831 - INFO - Loading plugin 'docling_defaults'
2025-11-30 13:18:01,836 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-30 13:18:01,846 - INFO - Loading plugin 'docling_defaults'
2025-11-30 13:18:01,852 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-30 13:18:02,160 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-30 13:18:02,177 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-30 13:18:02,186 [RapidOCR] download_file.py:60: File exists and is valid: /home/nir/projects/graph_rag/.venv/lib/python3.13/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-30 13:

In [11]:
chunked_pdf[0]

{'file': 'Nir_Potasman_CV.pdf',
 'chunks': ['Results-driven  Data  Scientist  leveraging  advanced  analytics,  with  a  background  in biology,  medicine,  machine  learning,  and  statistical  modeling  to  drive  data-informed decision-making.  Skilled  in  extracting  actionable  insights  from  complex  datasets  and communicating findings to both technical  and  non-technical  stakeholders.  Proficient  in Linux,  C,  AWS,  Docker,  Python,  NoSQL,  SQL.  Passionate  about  solving  challenging problems and delivering innovative solutions that drive business growth and operational efficiency.',
  '- Machine Learning & AI\n- Statistical Analysis\n- Data Visualization\n- LLMs',
  '- Bachelor of Science in Biology & Life-Science Tel-Aviv University 2015-2018\n- Bachelor of Medical Sciences Tel-Aviv University 2020-2022\n- Doctor of Medicine\nTel-Aviv University 2020 - Pause',
  '2025 - Present - AI Engineer at Finubit',
  '- Architected and implemented a production-grade, multi-agen

In [12]:
len(chunked_pdf[0]["chunks"])
# The PDF has been chunked to 15 pieces.

15

## 2. Extracting Nodes and Relationships

In [13]:
from ingestion.orchestration import Orchestrator


In [14]:
orchestrator = Orchestrator(llm_model, llm_api_key)

In [15]:
nodes, relationships, chunk_node_mapping = orchestrator.extract_graph_components(
    chunked_pdf
)

[92m13:18:07 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
2025-11-30 13:18:07,260 - INFO - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
[92m13:18:11 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-30 13:18:11,722 - INFO - Wrapper: Completed Call, calling success_handler
[92m13:18:11 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
2025-11-30 13:18:11,739 - INFO - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
[92m13:18:26 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-30 13:18:26,881 - INFO - Wrapper: Completed Call, calling success_handler
[92m13:18:26 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
2025-11-30 13:18:26,886 - INFO - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini

---

## 3. Ingest nodes and relationships into Neo4j

In [16]:
from rag_handler.neo4j_orchestrator import Neo4jOrchestrator

neo4_orchestrator = Neo4jOrchestrator(
    neo4j_url=neo4j_uri, auth=(neo4j_username, neo4j_password)
)
neo4_orchestrator.ingest_to_neo4j(nodes, relationships, chunk_node_mapping)

2025-11-30 13:21:14,419 - INFO - Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (e))} {position: line: 1, column: 1, offset: 0} for query: 'MATCH (c:Chunk {id: $chunk_id}), (e:Entity {id: $entity_id}) CREATE (c)-[:MENTIONS]->(e)'
2025-11-30 13:21:14,441 - INFO - Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {ca

{'Data Scientist': '7dc03499-8d2b-493d-9c37-3bb1a257cc5b',
 'advanced analytics': 'eeac288f-b265-4e04-9c1f-b90ebadafa7d',
 'biology': '3cbd5c61-95be-45d3-8ffa-ba90a86acaf0',
 'medicine': 'b773c0ee-16cd-4a79-9ad2-f0356942ef65',
 'machine learning': '54acd8bb-fc3d-40ca-8d4d-1da3dff5e8c0',
 'statistical modeling': '45cc1b4d-8053-4a63-9057-8b44e84cb0ce',
 'data-informed decision-making': 'f9777f68-a443-4311-99fa-30a5e252c1c9',
 'extracting actionable insights': '2a8f8332-954f-4a77-896f-7599518d2529',
 'communicating findings': '8484e54e-4b38-4aec-b1c5-cbb847b6b400',
 'actionable insights': 'bc451311-d9a8-4525-bb70-9c59de7ca5f7',
 'complex datasets': '76741fe8-de9c-4b85-9862-56f753fdd409',
 'findings': 'ff740a44-5955-4254-8157-0be59a043ab4',
 'technical stakeholders': '5fea5ba1-ef22-414b-a46a-6f59d3b8f2fd',
 'non-technical stakeholders': 'ffd062a0-04e0-4b7c-94ec-6d8d5ac72a4c',
 'Linux': 'de0267cc-0f6b-40e6-b03d-3f2494526e34',
 'C': '65147d06-4294-4f5e-8e16-c1bcb053716b',
 'AWS': '39e1d6fe-e

---

## 4. Embed the data

In [17]:
embedded_data = chunk_embedder.embed_chunks(chunked_pdf)

[92m13:21:32 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-30 13:21:32,334 - INFO - Wrapper: Completed Call, calling success_handler


embedded_data is a dictionary INSIDE A LIST (since there may be multiple files) that looks like this:
```
{
    "source_file" : <file_name.pdf>,
    "chunks" : ['chunk_1', 'chunk_2', ...],
    "embeddings" : [embedding_1, embedding_2, ...]
}
```
* number of embeddings as the number of chunks.
* if Nir_CV.pdf has 15 chunks, embeddings will look like this:
```
[[-1.001, -3.512, ...], [0.23541, 1.2345, ...], ... , embedding_15]
```
---

ULTIMATELY, embedded_data looks like this:
```
[
    {
        "source_file" : <file_name.pdf>,
        "chunks" : ['chunk_1', 'chunk_2', ...],
        "embeddings" : [embedding_1, embedding_2, ...]
    },
    {
        "source_file" : <another_file.pdf>,
        "chunks" : ['chunk_1', 'chunk_2', ...],
        "embeddings" : [embedding_1, embedding_2, ...]
    },
    ...
]
```

## 5. Ingest to Qdrant

In [18]:
# ingesting embedded data to qdrant:
qdrant_orchestrator.ingest_to_qdrant(collection_name, embedded_data, chunk_node_mapping)

---
## Embedding Tester + Query Embedding

In [19]:
user_query: str = "What is Nir Potasman's Email Address?"


In [20]:
embedded_user_query = chunk_embedder.embedding_text(user_query)

[92m13:21:32 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-30 13:21:32,878 - INFO - Wrapper: Completed Call, calling success_handler


---
## Testing Retriever

In [21]:
from answering_agent import retriever_search

ModuleNotFoundError: No module named 'answering_agent'

In [None]:
retriever_result = retriever_search(
    neo4j_driver=neo4j_driver,
    qdrant_client=qdrant_client,
    collection_name=collection_name,
    embedded_query=embedded_user_query,
)

In [None]:
retriever_result

In [None]:
chunk_ids = [
    item.content.split("'id': '")[1].split("'")[0] for item in retriever_result.items
]


In [None]:
chunk_ids

In [None]:
def fetch_related_graph(neo4j_client, chunk_ids):
    query = """
    // Start from Chunks, find their Entities
    MATCH (c:Chunk)-[:MENTIONS]->(e:Entity)
    WHERE c.id IN $chunk_ids
    
    // Get entities and their 1-hop relationships
    OPTIONAL MATCH (e)-[r]-(related:Entity)
    RETURN e, r, related
    """
    with neo4j_client.session() as session:
        result = session.run(query, chunk_ids=chunk_ids)
        subgraph = []
        for record in result:
            if record["r"] and record["related"]:
                subgraph.append(
                    {
                        "entity": record["e"],
                        "relationship": record["r"],
                        "related_node": record["related"],
                    }
                )
        return subgraph

In [None]:
subgraph_answers = fetch_related_graph(neo4j_client=neo4j_driver, chunk_ids=entity_ids)

In [None]:
subgraph_answers

In [None]:
def format_graph_context(subgraph):
    nodes = set()
    edges = []

    for entry in subgraph:
        entity = entry["entity"]
        related = entry["related_node"]
        relationship = entry["relationship"]

        nodes.add(entity["name"])
        nodes.add(related["name"])

        edges.append(f"{entity['name']} {relationship['type']} {related['name']}")

    return {"nodes": list(nodes), "edges": edges}

In [None]:
graph_context = format_graph_context(subgraph=subgraph_answers)

In [None]:
graph_context

In [None]:
def extract_chunk_text_from_retriever_results(retriever_result):
    """Extract the actual text from chunks returned by retriever"""
    chunks = []
    for item in retriever_result.items:
        # Parse the chunk text from the Neo4j node properties
        content = item.content
        # Extract text property
        text_start = content.find("'text': '") + len("'text': '")
        text_end = content.find("'", text_start)
        chunk_text = content[text_start:text_end]
        
        # Also get the score for ranking
        score_start = content.find("score=") + len("score=")
        score_end = content.find(">", score_start)
        score = float(content[score_start:score_end])
        
        chunks.append({
            "text": chunk_text,
            "score": score
        })
    return chunks

In [None]:
chunk_texts = extract_chunk_text_from_retriever_results(retriever_result)


In [None]:
from litellm import completion


In [None]:
def graphRAG_run(chunk_texts, graph_context, user_query):
    """
    Combine semantic chunks with graph knowledge
    
    Args:
        chunk_texts: List of relevant text chunks from Qdrant
        graph_context: Dict with nodes and edges from Neo4j
        user_query: User's question
    """
    # Format relevant chunks
    chunks_str = "\n\n".join([
        f"Chunk {i+1} (relevance: {chunk['score']:.2f}):\n{chunk['text']}"
        for i, chunk in enumerate(chunk_texts)
    ])
    
    # Format graph context
    nodes_str = ", ".join(graph_context["nodes"])
    edges_str = "; ".join(graph_context["edges"])
    
    prompt = f"""
    You are an intelligent assistant. Use both the semantically relevant text chunks 
    and the knowledge graph to provide an accurate, comprehensive answer.
    
    === RELEVANT TEXT CHUNKS (from vector search) ===
    {chunks_str}
    
    === KNOWLEDGE GRAPH (entities and relationships) ===
    Nodes: {nodes_str}
    
    Edges: {edges_str}
    
    === USER QUESTION ===
    {user_query}
    
    === INSTRUCTIONS ===
    1. First, identify relevant information from the text chunks (direct quotes)
    2. Then, enrich your answer using the knowledge graph relationships
    3. Provide a comprehensive answer that combines both sources
    4. Cite specific chunks when using direct information
    """
    
    try:
        response = completion(
            model=llm_model,
            api_key=llm_api_key,
            messages=[
                {
                    "role": "system",
                    "content": "Provide a thorough answer using both the text chunks and knowledge graph.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message
    
    except Exception as e:
        return f"Error querying LLM: {str(e)}"

In [None]:
chunk_texts = extract_chunk_text_from_retriever_results(retriever_result)


In [None]:
final_answer = graphRAG_run(
    chunk_texts=chunk_texts,
    graph_context=graph_context, 
    user_query=user_query
)

In [None]:
final_answer