## PoC for advanced RAG with PostgreSQL

___
### Activate python virtual env

In [None]:
# A virtual environment cannot be activated from inside a running kernel:
# `%source` is not an IPython magic, and `source` in a subshell would not
# affect this process. Activate the environment in a terminal *before*
# starting Jupyter (or select the venv as the notebook kernel):
#
#     source ~/path-to-your-project/llamaindex-venv/bin/activate
#
# This cell only confirms which interpreter the kernel is running on.
import sys

print(sys.executable)

___
___
___
## Setup Postgres and Dependencies

In [None]:
%pip install llama-index-vector-stores-postgres

In [None]:
import os
import getpass
import subprocess

def run_sudo(cmd, sudo_password, check=True):
    """Execute ``cmd`` through ``sudo -S``, feeding the password on stdin.

    Args:
        cmd: Command and its arguments as a list of strings, without the
            leading ``sudo``.
        sudo_password: Password written to the process's stdin, followed by
            a newline.
        check: Forwarded to ``subprocess.run``; when true, a non-zero exit
            status raises ``subprocess.CalledProcessError``.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout and stderr captured
        as text.
    """
    argv = ["sudo", "-S"]
    argv = argv + cmd
    stdin_payload = sudo_password + "\n"
    run_options = {
        "input": stdin_payload,
        "text": True,
        "capture_output": True,
        "check": check,
        # run from /tmp rather than the notebook's working directory
        "cwd": "/tmp",
    }
    return subprocess.run(argv, **run_options)

# --- passwords ---
# Both values are prompted for interactively and kept in these module-level
# variables, which the following setup cells read.
sudo_password = getpass.getpass("Provide sudo password: ")
postgres_pw = getpass.getpass("Provide PostgreSQL password for user 'postgres': ")

In [None]:
# --- system packages ---
run_sudo(["apt", "update"], sudo_password)
run_sudo(["apt", "install", "-y", "postgresql-common"], sudo_password)
print("✅ system packages")

# Add PostgreSQL APT repo helper (from postgresql-common)
run_sudo(["/usr/share/postgresql-common/pgdg/apt.postgresql.org.sh"], sudo_password)
print("✅ PostgreSQL APT repo helper")

# Install PostgreSQL + pgvector.
# This goes through run_sudo (argument list, no shell) so the sudo password is
# never interpolated into a shell command line, and a failing install raises
# CalledProcessError instead of being silently ignored. "-y" is needed because
# there is no terminal to answer apt's confirmation prompt.
run_sudo(["apt", "install", "-y", "postgresql-15-pgvector"], sudo_password)
print("✅ Install PostgreSQL + pgvector")

## Start and enable PostgreSQL service:
Ensures the DB server is running and starts automatically on reboot

In [None]:
# Ensure service is running
run_sudo(["systemctl", "enable", "--now", "postgresql"], sudo_password)
print("✅ service is running")

# --- set postgres user password ---
# Double every single quote so the password is always a valid SQL string
# literal; an unescaped quote would terminate the literal early and break
# (or alter) the statement.
escaped_postgres_pw = postgres_pw.replace("'", "''")
sql_set_pw = f"ALTER USER postgres WITH PASSWORD '{escaped_postgres_pw}';"
# Output is deliberately not captured so psql's "ALTER ROLE" confirmation
# shows up in the cell output.
res = subprocess.run(
    ["sudo", "-S", "-u", "postgres", "psql", "-c", sql_set_pw],
    input=(sudo_password + "\n"),
    text=True,
    check=True,
    cwd="/tmp",
)
print("✅ set postgres user password")

✅ service is running
ALTER ROLE
✅ set postgres user password


## Create the database

In [None]:
# --- create database (idempotent) ---
# CREATE DATABASE cannot be executed inside a DO block / function, so the
# existence check and the creation are issued as two separate psql calls.
sql_db_exists = "SELECT 1 FROM pg_database WHERE datname = 'vector_db';"
sql_create_db = "CREATE DATABASE vector_db;"

db_check = subprocess.run(
    # -t / -A: tuples only, unaligned -> stdout is "1" when the DB exists,
    # empty otherwise
    ["sudo", "-S", "-u", "postgres", "psql", "-tAc", sql_db_exists],
    input=(sudo_password + "\n"),
    text=True,
    capture_output=True,
    check=True,
    cwd="/tmp",
)

if db_check.stdout.strip() != "1":
    subprocess.run(
        ["sudo", "-S", "-u", "postgres", "psql", "-c", sql_create_db],
        input=(sudo_password + "\n"),
        text=True,
        check=True,
        cwd="/tmp",
    )
print("✅ create database")

DO
✅ create database


### Connect to vector_db and enable pgvector

In [None]:
import psycopg2

# --- connect with psycopg2 to the new DB and enable pgvector extension ---
connection_string=f"postgresql://postgres:{postgres_pw}@localhost:5432"

db_name = "vector_db"
conn = psycopg2.connect(
    dbname=db_name,
    user="postgres",
    password=postgres_pw,
    host="localhost",
    port=5432,
)
try:
    # CREATE EXTENSION should take effect immediately, without an explicit commit
    conn.autocommit = True

    with conn.cursor() as c:
        c.execute("CREATE EXTENSION IF NOT EXISTS vector;")
        c.execute("SELECT extname, extversion FROM pg_extension WHERE extname='vector';")
        print("pgvector extension:", c.fetchone())
finally:
    # Close the connection even when a statement fails, so no session is
    # left open on the server.
    conn.close()

print("✅ PostgreSQL + pgvector ready. DB: vector_db, user: postgres")

pgvector extension: ('vector', '0.8.2')
✅ PostgreSQL + pgvector ready. DB: vector_db, user: postgres


___
___
___

# RAG pipeline 

### Load credentials

In [None]:
from getpass import getpass

if "LLAMA_CLOUD_API_KEY" not in os.environ:
    os.environ["LLAMA_CLOUD_API_KEY"] = getpass("Enter your Llama Cloud API Key: ")

# Reuse a key that is already exported; otherwise prompt for it.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
if OPENAI_KEY == "":
    OPENAI_KEY = getpass("Enter your OpenAI API Key: ")

# The OpenAI LLM and embedding objects in this notebook are constructed
# without an explicit api_key, so the key has to be available through the
# environment rather than only in this variable.
os.environ["OPENAI_API_KEY"] = OPENAI_KEY

### Import libraries/packages

In [None]:
import os

import nest_asyncio
nest_asyncio.apply()

# Parse
from llama_cloud_services import LlamaParse
from copy import deepcopy

# Models
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# vector index
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine

# kg index
from llama_index.core import PropertyGraphIndex
from llama_index.core.indices.property_graph import VectorContextRetriever

# Extractors
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
)

# Custom retriever
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore, Document
from typing import List

# Retrievers
from llama_index.core.query_engine import RetrieverQueryEngine
# BM25Retriever is distributed in the separate `llama-index-retrievers-bm25`
# package and is not exported from llama_index.core.retrievers.
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.graph_stores.neo4j import Neo4jPGStore

# agent
from llama_index.core.tools import QueryEngineTool, ToolMetadata
# from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent.workflow import ReActAgent
from llama_index.core.workflow import Context
from llama_index.core.agent.workflow import ToolCallResult, AgentStream


_______________________________
### Setup Models
Here we use `gpt-4o` as the LLM and OpenAI's `text-embedding-3-small` model for embeddings.
_______________________________

In [None]:
# One chat model and one embedding model, registered as the global defaults
llm_model = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm, Settings.embed_model = llm_model, embed_model

# Chunk configuration (size, then overlap)
Settings.chunk_size, Settings.chunk_overlap = 1024, 200

print(Settings.context_window)

_______________________________
### 1. Parsing (``parse into document``)
_______________________________

##### Load and parse Data with agent

In [None]:
# Configure the parser first, then load the PDF with it.
# Alternative model: "openai-gpt-4-1-mini"
parser = LlamaParse(
    parse_mode="parse_page_with_agent",
    model="anthropic-sonnet-4.0",
    high_res_ocr=True,
    adaptive_long_table=True,
    outlined_table_extraction=True,
    output_tables_as_HTML=True,
)
docs = parser.load_data("../data/bevel_gear.pdf")

_______________________________
### 2. Splitting (``manual split``)
_______________________________

##### Split by page

In [None]:
def get_sub_docs(docs):
    """Split each document's text on the ``\\n---\\n`` separator.

    Every resulting chunk becomes its own ``Document`` carrying a deep copy
    of the parent's metadata. If the parent metadata has no ``page_number``
    key, the chunk's 1-based position within its parent is stored there;
    an existing value is left untouched.

    Args:
        docs: Iterable of documents exposing ``text`` and ``metadata``.

    Returns:
        A flat list of ``Document`` objects, in input order.
    """

    def _chunk_document(parent, position, chunk_text):
        metadata = deepcopy(parent.metadata)
        # only fills the key when the parent did not provide one
        metadata.setdefault("page_number", position)
        return Document(text=chunk_text, metadata=metadata)

    return [
        _chunk_document(parent, position, chunk_text)
        for parent in docs
        for position, chunk_text in enumerate(parent.text.split("\n---\n"), start=1)
    ]


sub_docs = get_sub_docs(docs)

_______________________________
### 3. Indexing
_______________________________

##### 3.1 Vector-based

In [None]:
# No vector store is configured here, so the embeddings are kept in memory.
# To persist them in an external store (e.g. Postgres/pgvector, FAISS,
# Pinecone, Weaviate), wrap the store in a StorageContext and pass it as
# `storage_context=`; `vector_store=` is not a parameter of from_documents.
base_index = VectorStoreIndex.from_documents(
    sub_docs,
    embed_model=embed_model,
)

##### 3.2 KG-based

##### 3.2.1 Initialize Graph Store

To launch Neo4j locally, first ensure you have Docker installed. Then, you can launch the database with the following Docker command:

```bash
docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4JLABS_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
```

From here, you can open the database at [http://localhost:7474/](http://localhost:7474/). On this page, you will be asked to sign in. Use the default credentials: username `neo4j`, password `neo4j`.
The first time you log in, you will be asked to change the password.

After this, you are ready to create your first property graph!

##### 3.2.2 Extract entities/relations

In [None]:
# Connection to the Neo4j instance on the local bolt port
neo4j_graph_store_pw = getpass("Enter your Neo4j Graph Store Password: ")
graph_store = Neo4jPGStore(
    url="bolt://localhost:7687",
    username="neo4j",
    password=neo4j_graph_store_pw,  # your password
)
vec_store = None

# Embedding model and path extractors handed to the property graph index
kg_embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
path_extractors = [
    ImplicitPathExtractor(),
    SimpleLLMPathExtractor(
        llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3),
        num_workers=4,
        max_paths_per_chunk=10,
    ),
]

index = PropertyGraphIndex.from_documents(
    sub_docs,
    embed_model=kg_embed_model,
    kg_extractors=path_extractors,
    property_graph_store=graph_store,
    show_progress=True,
)

_______________________________
### 4. Retrieval
_______________________________

##### 4.1 Vector retriever (embeddings similarity)

In [None]:
vector_retriever = base_index.as_retriever(similarity_top_k=10)
naive_query_engine = RetrieverQueryEngine(vector_retriever)

# Query
# Adjacent string literals are joined with nothing in between, so the first
# part ends with a space to keep the two questions separated.
response = naive_query_engine.query(
    "Worum geht es in dem Dokument? Antworte in 2-3 Sätzen. "
    "Aus welchem Material besteht das Kegelrad?"
)
print(str(response))

##### 4.2 Hybrid retriever (BM25 keyword + vector)

In [None]:
# BM25Retriever has no `from_documents` constructor; `from_defaults` takes
# the corpus as `nodes=`. The page-level documents are passed directly so the
# keyword index covers the same corpus the original call intended.
bm25 = BM25Retriever.from_defaults(
    nodes=sub_docs,
    similarity_top_k=10,
)
hybrid_retriever = QueryFusionRetriever(
    retrievers=[bm25, vector_retriever],
    similarity_top_k=10,
)
hybrid_query_engine = RetrieverQueryEngine.from_args(retriever=hybrid_retriever)

# Query
# Adjacent string literals are joined with nothing in between, so the first
# part ends with a space to keep the two questions separated.
response = hybrid_query_engine.query(
    "Worum geht es in dem Dokument? Antworte in 2-3 Sätzen. "
    "Aus welchem Material besteht das Kegelrad?"
)
print(str(response))

##### 4.3 Knowledge graph retriever

In [None]:
# Retriever over the property graph store built by the index above
kg_embed = OpenAIEmbedding(model_name="text-embedding-3-small")
kg_retriever = VectorContextRetriever(
    index.property_graph_store,
    embed_model=kg_embed,
    similarity_top_k=5,
    path_depth=1,
    include_text=True,  # set to False to leave out the source text
)

kg_query = "Gib mir die ganze Reihe für den Kegelrad aus Zink mit M=2,0."
nodes = kg_retriever.retrieve(kg_query)

# Number of hits, then one line per retrieved node
print(len(nodes))
for idx, node in enumerate(nodes):
    content = node.get_content()
    print(f">> IDX: {idx}, {content}")

##### 4.4 Custom retriever (vector + KG)

In [None]:
class CustomRetriever(BaseRetriever):
    """Custom retriever that performs both KG vector search and direct vector search.

    Args:
        kg_retriever: Retriever queried first; its results seed the merged set.
        vector_retriever: Retriever queried second; on a shared ``node_id``
            its result replaces the KG one.
    """

    def __init__(self, kg_retriever, vector_retriever):
        self._kg_retriever = kg_retriever
        self._vector_retriever = vector_retriever
        # Let BaseRetriever set up its own state before the public
        # retrieve() entry point is used.
        super().__init__()

    def _retrieve(self, query_bundle) -> List[NodeWithScore]:
        """Return the results of both retrievers, de-duplicated by ``node_id``.

        Args:
            query_bundle: The query, forwarded unchanged to both retrievers.

        Returns:
            One entry per distinct ``node_id``: KG results in their original
            order, followed by vector results whose ids were not seen before.
        """
        kg_nodes = self._kg_retriever.retrieve(query_bundle)
        vector_nodes = self._vector_retriever.retrieve(query_bundle)

        # dict keeps first-insertion order; a later entry with the same key
        # overwrites the value in place
        unique_nodes = {n.node_id: n for n in kg_nodes}
        unique_nodes.update({n.node_id: n for n in vector_nodes})
        return list(unique_nodes.values())
custom_retriever = CustomRetriever(kg_retriever, vector_retriever)

custom_query = (
    "Gib mir die ganze Reihe für den Kegelrad aus Zink mit M zwischen 1,0 und 2,0 und ZB=6,9 mm"
)
nodes = custom_retriever.retrieve(custom_query)

# Number of merged hits, then one line per node
print(len(nodes))
for idx, node in enumerate(nodes):
    content = node.get_content()
    print(f">> IDX: {idx}, {content}")