## Imports

In [1]:
# Add project root to Python path ONLY FOR NOTEBOOK!!!
import sys
from pathlib import Path

project_root = Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [1]:
from neo4j import GraphDatabase
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from collections import defaultdict
from neo4j_graphrag.retrievers import QdrantNeo4jRetriever
import uuid
import os

from config.datasets import GraphComponents, Single

In [2]:
# Load environment variables
load_dotenv()

True

---

## Setting Up Environment Variables

In [3]:
# Get credentials from environment variables (derived from docker-compose config)
qdrant_port = os.getenv("QDRANT_HTTP_PORT", "6333")
qdrant_host = os.getenv("QDRANT_URL", "http://localhost")
qdrant_url = f"{qdrant_host}:{qdrant_port}"

neo4j_bolt_port = os.getenv("NEO4J_BOLT_PORT", "7687")
neo4j_uri = f"bolt://localhost:{neo4j_bolt_port}"

# Parse NEO4J_AUTH (format: username/password)
neo4j_auth = os.getenv("NEO4J_AUTH", "neo4j/password")
neo4j_username, neo4j_password = neo4j_auth.split("/", 1)

# LLM:
llm_model = os.getenv("LLM_MODEL")
llm_api_key = os.getenv("LLM_API_KEY")

# Data and Other:
data_folder = os.getenv("RAW_DATA_FOLDER")

In [4]:
# Print out all envs:
print(qdrant_port)
print(qdrant_host)
print(qdrant_url)
print(neo4j_uri)
print(neo4j_username)
print(neo4j_password)
print(llm_model)
print(data_folder)


6333
http://localhost
http://localhost:6333
bolt://localhost:7687
neo4j
password
perplexity/sonar-pro
./raw_data/


---

## Initialization of Neo4j and Qdrant Clients

In [6]:
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

qdrant_client = QdrantClient(
    url=qdrant_url,
)

---
## TEST - Graph Creation

In [7]:
from ingestion.orchestration import Orchestrator

orchestrator = Orchestrator(llm_model, llm_api_key)
orchestrator._test_response("What is the capital of France?")

'The capital of France is **Paris**[1][3][6].\n\nParis is located in the north-central part of the country along the Seine River. It is not only the administrative and political center but also the largest city in France, widely recognized as a global center for art, fashion, culture, and history[1][3][6]. The city\'s status as the capital has shifted temporarily during certain historical periods (e.g., wartime), but Paris has been the official capital continuously since 1944 and for most of French history since the late 10th century[5][6].\n\nMajor landmarks in Paris include the **Eiffel Tower**, **Notre-Dame Cathedral**, and the **Louvre Museum**[6]. Paris is often referred to as the "City of Light" due to its historical significance as a center of education and ideas, as well as its early adoption of street lighting[2][6].'

In [8]:
testing_graph =orchestrator.llm_parser(
    "Zachary has 3 children: Refael (male), his sister Noah, and the youngest boy Beerih."
)

In [9]:
testing_graph

GraphComponents(graph=[Single(node='Zachary', relationship='parent of', target_node='Refael'), Single(node='Zachary', relationship='parent of', target_node='Noah'), Single(node='Zachary', relationship='parent of', target_node='Beerih'), Single(node='Refael', relationship='sibling of', target_node='Noah'), Single(node='Refael', relationship='sibling of', target_node='Beerih'), Single(node='Noah', relationship='sibling of', target_node='Refael'), Single(node='Noah', relationship='sibling of', target_node='Beerih'), Single(node='Beerih', relationship='sibling of', target_node='Refael'), Single(node='Beerih', relationship='sibling of', target_node='Noah'), Single(node='Noah', relationship='has gender', target_node='female'), Single(node='Refael', relationship='has gender', target_node='male'), Single(node='Beerih', relationship='has gender', target_node='male'), Single(node='Beerih', relationship='is youngest child of', target_node='Zachary')])

In [10]:
testing_graph.graph

[Single(node='Zachary', relationship='parent of', target_node='Refael'),
 Single(node='Zachary', relationship='parent of', target_node='Noah'),
 Single(node='Zachary', relationship='parent of', target_node='Beerih'),
 Single(node='Refael', relationship='sibling of', target_node='Noah'),
 Single(node='Refael', relationship='sibling of', target_node='Beerih'),
 Single(node='Noah', relationship='sibling of', target_node='Refael'),
 Single(node='Noah', relationship='sibling of', target_node='Beerih'),
 Single(node='Beerih', relationship='sibling of', target_node='Refael'),
 Single(node='Beerih', relationship='sibling of', target_node='Noah'),
 Single(node='Noah', relationship='has gender', target_node='female'),
 Single(node='Refael', relationship='has gender', target_node='male'),
 Single(node='Beerih', relationship='has gender', target_node='male'),
 Single(node='Beerih', relationship='is youngest child of', target_node='Zachary')]

In [11]:
for entry in testing_graph.graph:
        node = entry.node
        target_node = entry.target_node  # Get target node if available
        relationship = entry.relationship  # Get relationship if available
        print(f"NODE: {node}, TARGET NODE: {target_node}, RELATIONSHIP: {relationship}")

NODE: Zachary, TARGET NODE: Refael, RELATIONSHIP: parent of
NODE: Zachary, TARGET NODE: Noah, RELATIONSHIP: parent of
NODE: Zachary, TARGET NODE: Beerih, RELATIONSHIP: parent of
NODE: Refael, TARGET NODE: Noah, RELATIONSHIP: sibling of
NODE: Refael, TARGET NODE: Beerih, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Refael, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Beerih, RELATIONSHIP: sibling of
NODE: Beerih, TARGET NODE: Refael, RELATIONSHIP: sibling of
NODE: Beerih, TARGET NODE: Noah, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: female, RELATIONSHIP: has gender
NODE: Refael, TARGET NODE: male, RELATIONSHIP: has gender
NODE: Beerih, TARGET NODE: male, RELATIONSHIP: has gender
NODE: Beerih, TARGET NODE: Zachary, RELATIONSHIP: is youngest child of


---

## TEST - graph extraction

In [6]:
from ingestion.file_reader import FileReader
from ingestion.chunker_embedder import ChunkerEmbedder

In [7]:
file_reader = FileReader(folder_path=data_folder)
files = file_reader.read_files()
# print(f"files: {files}")
chunk_embedder = ChunkerEmbedder(
    all_files=files,
    chunk_size=os.getenv("CHUNK_SIZE"),
    chunk_overlap=os.getenv("CHUNK_OVERLAP"),
    )


In [8]:
for chunk in chunk_embedder.chunk_text():
    print(f"file name: {chunk['file']}")
    print(f"chunks: {chunk['chunks']}")
    print(f"amount of chunks: {len(chunk['chunks'])}")

file name: raw_text.txt
chunks: ["The glob module in Python is used to find file paths that match a specified pattern using Unix shell-style wildcards.\n The primary function, glob.glob(), returns a list of path names matching a given pattern.\n For example, glob.glob('*.txt') will return all files in the current directory ending with .txt.", "To search recursively through subdirectories, the recursive parameter must be set to True, and the pattern ** is used to match any files and zero or more directories and subdirectories.\n For instance, glob.glob('**/*.py', recursive=True) will find all Python files in the current directory and all its subdirectories.\n The ** pattern can match across multiple directory levels, unlike *, which only matches files at the same directory level.", 'An alternative to glob.glob() is glob.iglob(), which returns an iterator instead of a list, making it more memory-efficient for large searches.\n This is particularly useful when dealing with a large number 

In [9]:
from ingestion.orchestration import Orchestrator


In [10]:
orchestrator = Orchestrator(llm_model, llm_api_key)

In [11]:
text_chunks = chunk_embedder.chunk_text()

In [12]:
nodes, relationships = orchestrator.extract_graph_components(text_chunks)


[92m14:13:37 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= sonar-pro; provider = perplexity
2025-11-18 14:13:37,269 - INFO - 
LiteLLM completion() model= sonar-pro; provider = perplexity
[92m14:13:42 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-18 14:13:42,074 - INFO - Wrapper: Completed Call, calling success_handler
[92m14:13:42 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= sonar-pro; provider = perplexity
2025-11-18 14:13:42,082 - INFO - 
LiteLLM completion() model= sonar-pro; provider = perplexity
[92m14:13:47 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-18 14:13:47,328 - INFO - Wrapper: Completed Call, calling success_handler
[92m14:13:47 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= sonar-pro; provider = perplexity
2025-11-18 14:13:47,331 - INFO - 
LiteLLM completion() model= sonar-pro; provider = perplexity
[92m14:13:50 - L

In [None]:
print("TEXT CHUNKS NODES AND RELATIONSHIPS")
print(f"nodes: {nodes}")
print("=" * 100)
print(f"relationships: {relationships}")




nodes: {'glob module': 'ffb5159f-49cc-456f-b7dc-14dd8c9738ed', 'find file paths that match a specified pattern': '31d88013-fa8f-4117-bcb9-ad843dcade0c', 'Unix shell-style wildcards': 'e18ed1c8-7702-4c77-81fc-9c3f8b37a9f1', 'glob.glob()': '66932647-a48b-4d7b-b754-086d59b44cff', 'list of path names matching a given pattern': 'eadabd06-a6eb-44c6-b69b-dc75bf3088ef', "glob.glob('*.txt')": 'ed90e538-5ce5-4c77-84bb-7d6e5d6010b5', 'all files in the current directory ending with .txt': 'd9bd3bee-5e1b-4235-af75-d48b4cd8430f', 'Python': '23efa91e-16f1-4044-a65b-9f938b6800e3', 'recursive parameter': '913970b0-90ee-463d-9fdd-90ed0440892d', 'True': 'ed3efa7f-75b1-456f-a248-08f105ffd323', 'search recursively through subdirectories': '474cec6d-73f2-44b4-be2f-79260d6f9ee1', '** pattern': '91dd1303-68cc-4d06-b03f-f45b59c4683e', 'match any files and zero or more directories and subdirectories': '71369574-1679-4abd-b910-8516fafa01ec', "glob.glob('**/*.py', recursive=True)": 'a778629d-9290-4dd6-a4a1-36d941

In [14]:
pdf_chunks = chunk_embedder.chunk_pdf()
nodes, relationships = orchestrator.extract_graph_components(pdf_chunks)

2025-11-18 14:16:18,485 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-18 14:16:18,531 - INFO - Going to convert document batch...
2025-11-18 14:16:18,532 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-18 14:16:18,544 - INFO - Loading plugin 'docling_defaults'
2025-11-18 14:16:18,547 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-18 14:16:18,554 - INFO - Loading plugin 'docling_defaults'
2025-11-18 14:16:18,560 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-18 14:16:19,351 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-18 14:16:19,368 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-18 14:16:19,378 [RapidOCR] download_file.py:60: File exists and is valid: /home/nir/projects/graph_rag/.venv/lib/python3.13/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-18 14:

In [15]:
print("PDF CHUNKS NODES AND RELATIONSHIPS")
print(f"nodes: {nodes}")
print("=" * 100)
print(f"relationships: {relationships}")

PDF CHUNKS NODES AND RELATIONSHIPS
nodes: {'Machine Learning': '629c2312-a0bd-4e84-92e5-fdf14f666d26', 'Data Visualization': 'f4bde00b-964d-46bf-84ff-a61b9d098597', 'AI': '4cffa6c4-a697-4ee2-b276-820437f807dd', 'Pattern Identification': '036c1a5f-486f-4510-9ae8-f662d95c56df', 'Statistical Analysis': '06abd511-ed02-49e9-82c9-267ce35f236b', 'Insights': '0a5fdabf-6be7-4af7-91a1-c6359d03cad2', 'LLMs': 'd64adc84-a6af-4d54-b00e-a67e1c8c2653', 'entity and relationship extraction': '356634f1-22f3-4871-bb35-21bd1b2fdfd4', 'Knowledge Graphs': '2f0551f9-33ef-4872-96d5-2267b843ecd7', 'Dash Library': '3eed3216-b113-4baa-bd0b-c3ce735583d9', 'AI/ML': 'b4a5b2d2-8640-4bd9-bb17-d04c9f6f7e76', 'Dash Library Applications': 'd36dd9f3-c8ba-40f2-b4a9-e66d97be968c', 'Person': 'edfd3b79-fef7-4e5c-9571-9cc91e1b2b4b', 'Bachelor of Science in Biology & Life-Science': 'c9b88d1e-b83f-4de9-8c44-20482a7630c0', 'Tel-Aviv University': 'a4e42029-3230-4ae6-b29c-49d733535490', '2015-2018': '684ad1cf-2768-4ba7-89c6-5e66cf0