## Imports

In [1]:
# Add project root to Python path ONLY FOR NOTEBOOK!!!
import sys
from pathlib import Path

project_root = (
    Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()
)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
from neo4j import GraphDatabase
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from collections import defaultdict
from neo4j_graphrag.retrievers import QdrantNeo4jRetriever
import uuid
import os

from config.datasets import GraphComponents, Single

In [3]:
# Load environment variables
load_dotenv()

True

---

## Setting Up Environment Variables

In [4]:
# Get credentials from environment variables (derived from docker-compose config)
qdrant_port = os.getenv("QDRANT_HTTP_PORT", "6333")
qdrant_host = os.getenv("QDRANT_URL", "http://localhost")
qdrant_url = f"{qdrant_host}:{qdrant_port}"


# Parse NEO4J_AUTH (format: username/password)
neo4j_auth = os.getenv("NEO4J_AUTH", "neo4j/password")
neo4j_username, neo4j_password = neo4j_auth.split("/", 1)
neo4j_bolt_port = os.getenv("NEO4J_BOLT_PORT")
neo4j_url = os.getenv("NEO4J_URL")
neo4j_uri = f"{neo4j_url}:{neo4j_bolt_port}"
# LLM:
llm_model = os.getenv("LLM_MODEL")
llm_api_key = os.getenv("LLM_API_KEY")

# Data and Other:
data_folder = os.getenv("RAW_DATA_FOLDER")

In [5]:
# Print out all envs:
print(qdrant_port)
print(qdrant_host)
print(qdrant_url)
print(neo4j_uri)
print(neo4j_username)
print(neo4j_password)
print(llm_model)
print(data_folder)


6333
http://localhost
http://localhost:6333
bolt://localhost:7687
neo4j
password
perplexity/sonar-pro
./raw_data/


---

## Initialization of Neo4j and Qdrant Clients

In [6]:
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

qdrant_client = QdrantClient(
    url=qdrant_url,
)

---
## TEST - Graph Creation

In [7]:
from ingestion.orchestration import Orchestrator

orchestrator = Orchestrator(llm_model, llm_api_key)
orchestrator._test_response("What is the capital of France?")

'The capital of France is **Paris**[1][2][3][5][6].\n\nParis serves as the national seat of government, culture, and commerce, located in the north-central part of the country along the Seine River[1][6]. Its position as the capital dates back to medieval times, first established under King Clovis I in 508 CE, and has remained so with only brief interruptions during periods of war or political turmoil[3][4][5]. Today, Paris is not only the political capital but also the largest city in France, with a population of over 2 million residents within city limits and a metropolitan area exceeding 10 million[2][6]. It is renowned globally for its historical landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum, affirming its status as a major world city[1][5].'

In [8]:
testing_graph = orchestrator.llm_parser(
    "Zachary has 3 children: Refael (male), his sister Noah, and the youngest boy Beerih."
)

In [9]:
testing_graph

GraphComponents(graph=[Single(node='Zachary', relationship='parent of', target_node='Refael'), Single(node='Zachary', relationship='parent of', target_node='Noah'), Single(node='Zachary', relationship='parent of', target_node='Beerih'), Single(node='Refael', relationship='sibling of', target_node='Noah'), Single(node='Refael', relationship='sibling of', target_node='Beerih'), Single(node='Noah', relationship='sibling of', target_node='Refael'), Single(node='Noah', relationship='sibling of', target_node='Beerih'), Single(node='Beerih', relationship='sibling of', target_node='Refael'), Single(node='Beerih', relationship='sibling of', target_node='Noah'), Single(node='Noah', relationship='is sister of', target_node='Refael'), Single(node='Refael', relationship='is brother of', target_node='Noah'), Single(node='Beerih', relationship='is youngest brother of', target_node='Refael'), Single(node='Beerih', relationship='is youngest brother of', target_node='Noah')])

In [10]:
testing_graph.graph

[Single(node='Zachary', relationship='parent of', target_node='Refael'),
 Single(node='Zachary', relationship='parent of', target_node='Noah'),
 Single(node='Zachary', relationship='parent of', target_node='Beerih'),
 Single(node='Refael', relationship='sibling of', target_node='Noah'),
 Single(node='Refael', relationship='sibling of', target_node='Beerih'),
 Single(node='Noah', relationship='sibling of', target_node='Refael'),
 Single(node='Noah', relationship='sibling of', target_node='Beerih'),
 Single(node='Beerih', relationship='sibling of', target_node='Refael'),
 Single(node='Beerih', relationship='sibling of', target_node='Noah'),
 Single(node='Noah', relationship='is sister of', target_node='Refael'),
 Single(node='Refael', relationship='is brother of', target_node='Noah'),
 Single(node='Beerih', relationship='is youngest brother of', target_node='Refael'),
 Single(node='Beerih', relationship='is youngest brother of', target_node='Noah')]

In [11]:
for entry in testing_graph.graph:
    node = entry.node
    target_node = entry.target_node  # Get target node if available
    relationship = entry.relationship  # Get relationship if available
    print(f"NODE: {node}, TARGET NODE: {target_node}, RELATIONSHIP: {relationship}")

NODE: Zachary, TARGET NODE: Refael, RELATIONSHIP: parent of
NODE: Zachary, TARGET NODE: Noah, RELATIONSHIP: parent of
NODE: Zachary, TARGET NODE: Beerih, RELATIONSHIP: parent of
NODE: Refael, TARGET NODE: Noah, RELATIONSHIP: sibling of
NODE: Refael, TARGET NODE: Beerih, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Refael, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Beerih, RELATIONSHIP: sibling of
NODE: Beerih, TARGET NODE: Refael, RELATIONSHIP: sibling of
NODE: Beerih, TARGET NODE: Noah, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Refael, RELATIONSHIP: is sister of
NODE: Refael, TARGET NODE: Noah, RELATIONSHIP: is brother of
NODE: Beerih, TARGET NODE: Refael, RELATIONSHIP: is youngest brother of
NODE: Beerih, TARGET NODE: Noah, RELATIONSHIP: is youngest brother of


---

## TEST - graph extraction

In [12]:
from ingestion.file_reader import FileReader
from ingestion.chunker_embedder import ChunkerEmbedder

In [13]:
file_reader = FileReader(folder_path=data_folder)
files = file_reader.read_files()
# print(f"files: {files}")
chunk_embedder = ChunkerEmbedder(
    all_files=files,
    chunk_size=os.getenv("CHUNK_SIZE"),
    chunk_overlap=os.getenv("CHUNK_OVERLAP"),
)


In [14]:
for chunk in chunk_embedder.chunk_text():
    print(f"file name: {chunk['file']}")
    print(f"chunks: {chunk['chunks']}")
    print(f"amount of chunks: {len(chunk['chunks'])}")

file name: raw_text.txt
chunks: ["The glob module in Python is used to find file paths that match a specified pattern using Unix shell-style wildcards.\n The primary function, glob.glob(), returns a list of path names matching a given pattern.\n For example, glob.glob('*.txt') will return all files in the current directory ending with .txt.", "To search recursively through subdirectories, the recursive parameter must be set to True, and the pattern ** is used to match any files and zero or more directories and subdirectories.\n For instance, glob.glob('**/*.py', recursive=True) will find all Python files in the current directory and all its subdirectories.\n The ** pattern can match across multiple directory levels, unlike *, which only matches files at the same directory level.", 'An alternative to glob.glob() is glob.iglob(), which returns an iterator instead of a list, making it more memory-efficient for large searches.\n This is particularly useful when dealing with a large number 

In [15]:
from ingestion.orchestration import Orchestrator


In [16]:
orchestrator = Orchestrator(llm_model, llm_api_key)

In [17]:
# text_chunks = chunk_embedder.chunk_text()

In [18]:
# nodes, relationships = orchestrator.extract_graph_components(text_chunks)


In [19]:
# print("TEXT CHUNKS NODES AND RELATIONSHIPS")
# print(f"nodes: {nodes}")
# print("=" * 100)
# print(f"relationships: {relationships}")




In [20]:
pdf_chunks = chunk_embedder.chunk_pdf()
nodes, relationships = orchestrator.extract_graph_components(pdf_chunks)

2025-11-24 13:50:03,259 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-24 13:50:03,270 - INFO - Going to convert document batch...
2025-11-24 13:50:03,271 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-24 13:50:03,279 - INFO - Loading plugin 'docling_defaults'
2025-11-24 13:50:03,282 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-24 13:50:03,289 - INFO - Loading plugin 'docling_defaults'
2025-11-24 13:50:03,293 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-24 13:50:03,954 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-24 13:50:03,971 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-24 13:50:03,978 [RapidOCR] download_file.py:60: File exists and is valid: /home/nir/projects/graph_rag/.venv/lib/python3.13/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-24 13:

In [21]:
print("PDF CHUNKS NODES AND RELATIONSHIPS")
print(f"nodes: {nodes}")
print("=" * 100)
print(f"relationships: {relationships}")

PDF CHUNKS NODES AND RELATIONSHIPS
nodes: {'Data Scientist': 'c192436d-38c5-45a7-968c-d4b2c0cb2514', 'advanced analytics': '29cc6153-92af-4d9d-9b91-62952526d9ff', 'biology': '08e1c245-55ba-47a6-a169-e4535276cba6', 'medicine': '816b75d9-bc9d-48e6-bbdd-e3753ff1286c', 'machine learning': '4eb204f6-2331-4bc4-8fa9-80833c3dbe6b', 'statistical modeling': '97fbfa09-cba5-4426-9bd7-9bf7f1b20a59', 'data-informed decision-making': '104536fa-f5b0-46a0-9c44-8d575934ac22', 'extracting actionable insights': '23e2d238-f526-48f1-b1f8-32b831532b22', 'complex datasets': 'd4650e61-74d6-4f2d-8ec6-6f241fcf2f84', 'technical stakeholders': '8ba7adad-179d-44f3-9bf0-22947adfe496', 'non-technical stakeholders': 'a06d7b9d-46cf-4971-b7ca-92ca9f7900db', 'Linux': '50199455-1705-4f4f-ad28-720d5430d105', 'C': 'ab3c84d6-25ec-4419-a522-bc02463e9329', 'AWS': '9987cc9a-2679-485a-be6b-f50e087d2ba9', 'Docker': '72e41cf4-0a4d-425d-b6b8-2a5c8ae564b1', 'Python': '35dc956d-5eed-4a1f-afa9-cc6b67de0503', 'NoSQL': '4ee5ed47-8043-45

---

# Embed the data

In [22]:
embedded_data = chunk_embedder.embed_chunks(pdf_chunks)


[92m13:51:59 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-24 13:51:59,494 - INFO - Wrapper: Completed Call, calling success_handler


---

## Ingest to Neo4J

In [23]:
from rag_handler.neo4j_orchestrator import Neo4jOrchestrator

neo4_orchestrator = Neo4jOrchestrator(
    neo4j_url=neo4j_uri, auth=(neo4j_username, neo4j_password)
)
neo4_orchestrator.ingest_to_neo4j(nodes, relationships)

2025-11-24 13:52:01,329 - INFO - Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 1, column: 1, offset: 0} for query: 'MATCH (a:Entity {id: $source_id}), (b:Entity {id: $target_id}) CREATE (a)-[:leverages {original_type: $type, source_file: $source_file}]->(b)'
2025-11-24 13:52:01,368 - INFO - Received notification from DBMS server: {severity: INFORMATION} {code: N

{'Data Scientist': 'c192436d-38c5-45a7-968c-d4b2c0cb2514',
 'advanced analytics': '29cc6153-92af-4d9d-9b91-62952526d9ff',
 'biology': '08e1c245-55ba-47a6-a169-e4535276cba6',
 'medicine': '816b75d9-bc9d-48e6-bbdd-e3753ff1286c',
 'machine learning': '4eb204f6-2331-4bc4-8fa9-80833c3dbe6b',
 'statistical modeling': '97fbfa09-cba5-4426-9bd7-9bf7f1b20a59',
 'data-informed decision-making': '104536fa-f5b0-46a0-9c44-8d575934ac22',
 'extracting actionable insights': '23e2d238-f526-48f1-b1f8-32b831532b22',
 'complex datasets': 'd4650e61-74d6-4f2d-8ec6-6f241fcf2f84',
 'technical stakeholders': '8ba7adad-179d-44f3-9bf0-22947adfe496',
 'non-technical stakeholders': 'a06d7b9d-46cf-4971-b7ca-92ca9f7900db',
 'Linux': '50199455-1705-4f4f-ad28-720d5430d105',
 'C': 'ab3c84d6-25ec-4419-a522-bc02463e9329',
 'AWS': '9987cc9a-2679-485a-be6b-f50e087d2ba9',
 'Docker': '72e41cf4-0a4d-425d-b6b8-2a5c8ae564b1',
 'Python': '35dc956d-5eed-4a1f-afa9-cc6b67de0503',
 'NoSQL': '4ee5ed47-8043-455e-ae7d-79203e6efb71',
 'S

---

## Ingest to Qdrant

In [24]:
from rag_handler.qdrant_orchestrator import QdrantOrchestrator

qdrant_key = None
collection_name = "QdrantRagCollection"

In [25]:
qdrant_orchestrator = QdrantOrchestrator(qdrant_url=qdrant_url, qdrant_key=qdrant_key)

In [26]:
qdrant_orchestrator.create_collection()

Skipping creating collection; 'QdrantRagCollection' already exists.


In [27]:
# ingesting embedded data to qdrant:
qdrant_orchestrator.ingest_to_qdrant(
    collection_name=collection_name, embedded_data=embedded_data
)

---
## Embedding Tester + Query Embedding

In [28]:
user_query: str = "who is Nir Potasman?"


In [29]:
embedded_user_query = chunk_embedder.embedding_text(user_query)

[92m13:52:08 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-24 13:52:08,947 - INFO - Wrapper: Completed Call, calling success_handler


In [30]:
embedded_user_query = chunk_embedder.embedding_text(user_query)


[92m13:52:09 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-24 13:52:09,259 - INFO - Wrapper: Completed Call, calling success_handler


---
## Testing Retriever

In [31]:
from answering_agent import retriever_search

In [32]:
retriever_result = retriever_search(
    neo4j_driver=neo4j_driver,
    qdrant_client=qdrant_client,
    collection_name=collection_name,
    embedded_query=embedded_user_query,
)

In [36]:
retriever_result

RetrieverResult(items=[], metadata={'__retriever': 'QdrantNeo4jRetriever'})

In [34]:
entity_ids = [item.content.split("'id': '")[1].split("'")[0] for item in retriever_result.items]


In [35]:
entity_ids

[]