## Imports

In [1]:
# Add project root to Python path ONLY FOR NOTEBOOK!!!
import sys
from pathlib import Path

project_root = Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
from neo4j import GraphDatabase
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from collections import defaultdict
from neo4j_graphrag.retrievers import QdrantNeo4jRetriever
import uuid
import os

from config.datasets import GraphComponents, Single

In [3]:
# Load environment variables
load_dotenv()

True

---

## Setting Up Environment Variables

In [4]:
# Get credentials from environment variables (derived from docker-compose config)
qdrant_port = os.getenv("QDRANT_HTTP_PORT", "6333")
qdrant_host = os.getenv("QDRANT_URL", "http://localhost")
qdrant_url = f"{qdrant_host}:{qdrant_port}"



# Parse NEO4J_AUTH (format: username/password)
neo4j_auth = os.getenv("NEO4J_AUTH", "neo4j/password")
neo4j_username, neo4j_password = neo4j_auth.split("/", 1)
neo4j_bolt_port = os.getenv("NEO4J_BOLT_PORT")
neo4j_url = os.getenv('NEO4J_URL')
neo4j_uri = f"{neo4j_url}:{neo4j_bolt_port}"
# LLM:
llm_model = os.getenv("LLM_MODEL")
llm_api_key = os.getenv("LLM_API_KEY")

# Data and Other:
data_folder = os.getenv("RAW_DATA_FOLDER")

In [5]:
# Print out all envs:
print(qdrant_port)
print(qdrant_host)
print(qdrant_url)
print(neo4j_uri)
print(neo4j_username)
print(neo4j_password)
print(llm_model)
print(data_folder)


6333
http://localhost
http://localhost:6333
bolt://localhost:7687
neo4j
password
perplexity/sonar-pro
./raw_data/


---

## Initialization of Neo4j and Qdrant Clients

In [6]:
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

qdrant_client = QdrantClient(
    url=qdrant_url,
)

---
## TEST - Graph Creation

In [7]:
from ingestion.orchestration import Orchestrator

orchestrator = Orchestrator(llm_model, llm_api_key)
orchestrator._test_response("What is the capital of France?")

"The capital of France is **Paris**[1][2][7].\n\nParis is the country's largest city and its most important center for commerce, culture, and government, located in the north-central part of France along the Seine River[1]. Throughout history, Paris has served as the capital for most of France’s existence, except for brief periods during times of war or political upheaval[4][5]. Today, Paris remains the official seat of the French national government and the primary hub for French political and cultural life[2][7]."

In [8]:
testing_graph =orchestrator.llm_parser(
    "Zachary has 3 children: Refael (male), his sister Noah, and the youngest boy Beerih."
)

In [9]:
testing_graph

GraphComponents(graph=[Single(node='Zachary', relationship='parent of', target_node='Refael'), Single(node='Zachary', relationship='parent of', target_node='Noah'), Single(node='Zachary', relationship='parent of', target_node='Beerih'), Single(node='Refael', relationship='sibling of', target_node='Noah'), Single(node='Refael', relationship='sibling of', target_node='Beerih'), Single(node='Noah', relationship='sibling of', target_node='Refael'), Single(node='Noah', relationship='sibling of', target_node='Beerih'), Single(node='Beerih', relationship='sibling of', target_node='Refael'), Single(node='Beerih', relationship='sibling of', target_node='Noah'), Single(node='Noah', relationship='sister of', target_node='Refael'), Single(node='Beerih', relationship='youngest sibling of', target_node='Refael'), Single(node='Beerih', relationship='youngest sibling of', target_node='Noah'), Single(node='Refael', relationship='male', target_node=''), Single(node='Noah', relationship='female', target_

In [10]:
testing_graph.graph

[Single(node='Zachary', relationship='parent of', target_node='Refael'),
 Single(node='Zachary', relationship='parent of', target_node='Noah'),
 Single(node='Zachary', relationship='parent of', target_node='Beerih'),
 Single(node='Refael', relationship='sibling of', target_node='Noah'),
 Single(node='Refael', relationship='sibling of', target_node='Beerih'),
 Single(node='Noah', relationship='sibling of', target_node='Refael'),
 Single(node='Noah', relationship='sibling of', target_node='Beerih'),
 Single(node='Beerih', relationship='sibling of', target_node='Refael'),
 Single(node='Beerih', relationship='sibling of', target_node='Noah'),
 Single(node='Noah', relationship='sister of', target_node='Refael'),
 Single(node='Beerih', relationship='youngest sibling of', target_node='Refael'),
 Single(node='Beerih', relationship='youngest sibling of', target_node='Noah'),
 Single(node='Refael', relationship='male', target_node=''),
 Single(node='Noah', relationship='female', target_node=''),

In [11]:
for entry in testing_graph.graph:
        node = entry.node
        target_node = entry.target_node  # Get target node if available
        relationship = entry.relationship  # Get relationship if available
        print(f"NODE: {node}, TARGET NODE: {target_node}, RELATIONSHIP: {relationship}")

NODE: Zachary, TARGET NODE: Refael, RELATIONSHIP: parent of
NODE: Zachary, TARGET NODE: Noah, RELATIONSHIP: parent of
NODE: Zachary, TARGET NODE: Beerih, RELATIONSHIP: parent of
NODE: Refael, TARGET NODE: Noah, RELATIONSHIP: sibling of
NODE: Refael, TARGET NODE: Beerih, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Refael, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Beerih, RELATIONSHIP: sibling of
NODE: Beerih, TARGET NODE: Refael, RELATIONSHIP: sibling of
NODE: Beerih, TARGET NODE: Noah, RELATIONSHIP: sibling of
NODE: Noah, TARGET NODE: Refael, RELATIONSHIP: sister of
NODE: Beerih, TARGET NODE: Refael, RELATIONSHIP: youngest sibling of
NODE: Beerih, TARGET NODE: Noah, RELATIONSHIP: youngest sibling of
NODE: Refael, TARGET NODE: , RELATIONSHIP: male
NODE: Noah, TARGET NODE: , RELATIONSHIP: female
NODE: Beerih, TARGET NODE: , RELATIONSHIP: male


---

## TEST - graph extraction

In [12]:
from ingestion.file_reader import FileReader
from ingestion.chunker_embedder import ChunkerEmbedder

In [13]:
file_reader = FileReader(folder_path=data_folder)
files = file_reader.read_files()
# print(f"files: {files}")
chunk_embedder = ChunkerEmbedder(
    all_files=files,
    chunk_size=os.getenv("CHUNK_SIZE"),
    chunk_overlap=os.getenv("CHUNK_OVERLAP"),
    )


In [14]:
for chunk in chunk_embedder.chunk_text():
    print(f"file name: {chunk['file']}")
    print(f"chunks: {chunk['chunks']}")
    print(f"amount of chunks: {len(chunk['chunks'])}")

file name: raw_text.txt
chunks: ["The glob module in Python is used to find file paths that match a specified pattern using Unix shell-style wildcards.\n The primary function, glob.glob(), returns a list of path names matching a given pattern.\n For example, glob.glob('*.txt') will return all files in the current directory ending with .txt.", "To search recursively through subdirectories, the recursive parameter must be set to True, and the pattern ** is used to match any files and zero or more directories and subdirectories.\n For instance, glob.glob('**/*.py', recursive=True) will find all Python files in the current directory and all its subdirectories.\n The ** pattern can match across multiple directory levels, unlike *, which only matches files at the same directory level.", 'An alternative to glob.glob() is glob.iglob(), which returns an iterator instead of a list, making it more memory-efficient for large searches.\n This is particularly useful when dealing with a large number 

In [15]:
from ingestion.orchestration import Orchestrator


In [16]:
orchestrator = Orchestrator(llm_model, llm_api_key)

In [None]:
# text_chunks = chunk_embedder.chunk_text()

In [None]:
# nodes, relationships = orchestrator.extract_graph_components(text_chunks)


[92m11:18:57 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= sonar-pro; provider = perplexity
2025-11-23 11:18:57,707 - INFO - 
LiteLLM completion() model= sonar-pro; provider = perplexity
[92m11:19:00 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-23 11:19:00,863 - INFO - Wrapper: Completed Call, calling success_handler
[92m11:19:00 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= sonar-pro; provider = perplexity
2025-11-23 11:19:00,878 - INFO - 
LiteLLM completion() model= sonar-pro; provider = perplexity
[92m11:19:07 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-23 11:19:07,064 - INFO - Wrapper: Completed Call, calling success_handler
[92m11:19:07 - LiteLLM:INFO[0m: utils.py:3427 - 
LiteLLM completion() model= sonar-pro; provider = perplexity
2025-11-23 11:19:07,068 - INFO - 
LiteLLM completion() model= sonar-pro; provider = perplexity
[92m11:19:10 - L

In [None]:
# print("TEXT CHUNKS NODES AND RELATIONSHIPS")
# print(f"nodes: {nodes}")
# print("=" * 100)
# print(f"relationships: {relationships}")




TEXT CHUNKS NODES AND RELATIONSHIPS
nodes: {'glob module': 'bc9818d5-0fec-4669-af84-59a1df817517', 'file paths matching a specified pattern': '8c0b2124-5da2-49a0-9ea2-232f2f596ae7', 'Unix shell-style wildcards': '39a58f40-fd58-42ca-9293-adfa29f17f57', 'glob.glob() function': '0097508e-d31c-4769-9c6b-798c2834c480', 'list of path names matching a given pattern': '0dedc794-d19e-4ec8-b6ce-e6e3af90ea0c', "glob.glob('*.txt')": 'b9d73689-c0c4-40e1-b22d-921ec36c7b1c', 'all files in current directory ending with .txt': '9953d830-0128-423a-b4a9-8646b0a331b9', 'Python programming': 'fec61f5b-d675-4f6a-a284-b45806308dbc', 'pattern matching': '053c8f62-909d-4256-b08a-54cdffbf9d1d', 'recursive parameter': '225673ea-8632-4a91-9d6d-708527d01965', 'True': 'ff67c12b-61c8-4e39-b1b4-2d09192471fb', 'recursive search through subdirectories': 'a17ad1f2-551e-4795-b0ba-41d929305df7', "pattern '**'": 'f6aab59b-0bd6-4f6a-ab15-b926f3e56ceb', 'any files': 'ead13329-8e9d-4c18-bd8f-341682611dfa', 'zero or more direc

In [27]:
pdf_chunks = chunk_embedder.chunk_pdf()
nodes, relationships = orchestrator.extract_graph_components(pdf_chunks)

2025-11-23 11:21:39,982 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-23 11:21:40,042 - INFO - Going to convert document batch...
2025-11-23 11:21:40,043 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-23 11:21:40,068 - INFO - Loading plugin 'docling_defaults'
2025-11-23 11:21:40,071 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-23 11:21:40,085 - INFO - Loading plugin 'docling_defaults'
2025-11-23 11:21:40,092 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-23 11:21:40,872 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-23 11:21:40,890 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-23 11:21:40,900 [RapidOCR] download_file.py:60: File exists and is valid: /home/nir/projects/graph_rag/.venv/lib/python3.13/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-23 11:

In [28]:
print("PDF CHUNKS NODES AND RELATIONSHIPS")
print(f"nodes: {nodes}")
print("=" * 100)
print(f"relationships: {relationships}")

PDF CHUNKS NODES AND RELATIONSHIPS
nodes: {'Data Scientist': 'fb08bc87-01ca-4aaf-8d1f-6ea5e59d3270', 'advanced analytics': 'e932ac40-ceb5-4765-b94d-66d4c467aee2', 'biology': '9c15037a-0da8-4f1e-bdcc-43a4cd34fce6', 'medicine': '672cf5f5-62ee-4723-82e5-825fbd88e8a2', 'machine learning': '0262f53e-d204-4c04-9616-ba2824f9f958', 'statistical modeling': '4ffb001c-d757-476c-a2cc-662b1b3cee41', 'data-informed decision-making': '05240bc7-ad1e-493d-a863-2885c89059c9', 'actionable insights': '13a166c0-0505-4b20-97de-d373224cca8f', 'complex datasets': 'd9cd03bc-d563-440a-a7f8-60876e8c3dbf', 'technical stakeholders': '0de968a2-d255-41a0-9342-dab0ce538d9d', 'non-technical stakeholders': '02a1f2f0-bf6d-4db6-bd97-6ea0d71e7620', 'Linux': '600e9b4f-f0bc-43cf-b233-43949b5b5db6', 'C': '7b79bf54-37a4-429e-8016-f31d7063e0fd', 'AWS': '5bd78896-9d80-48a8-bcf1-021bbbb94b28', 'Docker': '0f25890a-9a90-4e46-b4ca-4cfb7337d74e', 'Python': 'e17c7cee-b9b3-4dde-b247-60c98721a10a', 'NoSQL': '2f755bef-ff5f-4ad7-8a3d-b0f

In [29]:
from rag_handler.neo4j_orchestrator import Neo4jOrchestrator
neo4_orchestrator = Neo4jOrchestrator(
    neo4j_url=neo4j_uri,
    auth=(neo4j_username, neo4j_password)
)
neo4_orchestrator.ingest_to_neo4j(nodes, relationships)

Ingesting 229 nodes and 268 relationships...


2025-11-23 11:23:51,140 - INFO - Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 1, column: 1, offset: 0} for query: 'MATCH (a:Entity {id: $source_id}), (b:Entity {id: $target_id}) MERGE (a)-[r:leverages]->(b) ON CREATE SET r.original_type = $type, r.source_file = $source_file ON MATCH SET r.original_type = $type, r.source_file = $source_file'
2025-11-23 11:23:51,

✓ Successfully ingested 229 nodes and 268 relationships!


{'Data Scientist': 'fb08bc87-01ca-4aaf-8d1f-6ea5e59d3270',
 'advanced analytics': 'e932ac40-ceb5-4765-b94d-66d4c467aee2',
 'biology': '9c15037a-0da8-4f1e-bdcc-43a4cd34fce6',
 'medicine': '672cf5f5-62ee-4723-82e5-825fbd88e8a2',
 'machine learning': '0262f53e-d204-4c04-9616-ba2824f9f958',
 'statistical modeling': '4ffb001c-d757-476c-a2cc-662b1b3cee41',
 'data-informed decision-making': '05240bc7-ad1e-493d-a863-2885c89059c9',
 'actionable insights': '13a166c0-0505-4b20-97de-d373224cca8f',
 'complex datasets': 'd9cd03bc-d563-440a-a7f8-60876e8c3dbf',
 'technical stakeholders': '0de968a2-d255-41a0-9342-dab0ce538d9d',
 'non-technical stakeholders': '02a1f2f0-bf6d-4db6-bd97-6ea0d71e7620',
 'Linux': '600e9b4f-f0bc-43cf-b233-43949b5b5db6',
 'C': '7b79bf54-37a4-429e-8016-f31d7063e0fd',
 'AWS': '5bd78896-9d80-48a8-bcf1-021bbbb94b28',
 'Docker': '0f25890a-9a90-4e46-b4ca-4cfb7337d74e',
 'Python': 'e17c7cee-b9b3-4dde-b247-60c98721a10a',
 'NoSQL': '2f755bef-ff5f-4ad7-8a3d-b0fac8ff614e',
 'SQL': '39840

---
## Embedding Tester

In [23]:
text_chunks

[{'file': 'raw_text.txt',
  'chunks': ["The glob module in Python is used to find file paths that match a specified pattern using Unix shell-style wildcards.\n The primary function, glob.glob(), returns a list of path names matching a given pattern.\n For example, glob.glob('*.txt') will return all files in the current directory ending with .txt.",
   "To search recursively through subdirectories, the recursive parameter must be set to True, and the pattern ** is used to match any files and zero or more directories and subdirectories.\n For instance, glob.glob('**/*.py', recursive=True) will find all Python files in the current directory and all its subdirectories.\n The ** pattern can match across multiple directory levels, unlike *, which only matches files at the same directory level.",
   'An alternative to glob.glob() is glob.iglob(), which returns an iterator instead of a list, making it more memory-efficient for large searches.\n This is particularly useful when dealing with a l

In [24]:
# Embed all chunks
embeddings = chunk_embedder.embed_chunks(text_chunks)  # List[List[float]]

[92m11:20:14 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-23 11:20:14,561 - INFO - Wrapper: Completed Call, calling success_handler
[92m11:20:16 - LiteLLM:INFO[0m: utils.py:1307 - Wrapper: Completed Call, calling success_handler
2025-11-23 11:20:16,323 - INFO - Wrapper: Completed Call, calling success_handler


In [25]:
type(embeddings)

list

In [26]:
embeddings[1]

{'file': 'short_story.txt',
 'chunks': ['In the town of Mipple-Moo, where roofs were square and doors were blue, lived a tiny girl named Tindle Tock who dreamed of catching the moon in a sock. Every night, she’d climb the Lolly Tree, the tallest tree for miles to see, waving her net and shouting, “Hey Moon! Come play before it’s half past noon!”',
  'But the moon, as moons often do, just blinked and softly said, “A-choo! Dear Tindle Tock, I can’t come near—earth’s too wobbly, dark, and clear!” Still, Tindle didn’t quit or sigh. She schemed and planned beneath the sky, sketching blueprints made of thread and glue, a moon-catcher shaped like a shoe.',
  'Her friends arrived—Finkle Fox with socks of green and stripes of clocks, Wobble the Croc who juggled stones, and Pib the Pig with silver bones. “You’ll never catch that moon, my dear,” said Croc, while twirling ear to ear. “It’s slippery, high, and far too bright! It only comes out late at night!”',
  'But Tindle grinned, her eyes aligh