Activate venv and install packages
1. !source reasongraph/bin/activate <br>
2. !pip install "langchain>=0.3" "langgraph>=0.2" qdrant-client sentence-transformers torch pydantic python-dotenv
3. !pip install pymupdf tiktoken # Count number of tokens to check model compatibility

In [None]:
import fitz, hashlib, torch, uuid, os, sys
from langgraph.graph import StateGraph, END
from pydantic import BaseModel, Field
from typing import Any, Dict, List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from pathlib import Path

In [None]:
class RAGState(BaseModel):
    """State object handed from one pipeline step function to the next."""

    # Raw page texts collected by load_and_chunk().
    docs: List[str] = Field(default_factory=list)
    # One dict per chunk; load_and_chunk() fills "text", "source", "page" and "chunk_index".
    chunks: List[Dict[str, Any]] = Field(default_factory=list) # Includes metadata
    # Current user query; rewrite_query() replaces it with the rephrased query.
    query: Optional[str] = None
    # Hits written by retrieve_from_qdrant(): text, score, id, source, page, chunk_index.
    results: List[Dict[str, Any]] = Field(default_factory=list)
    # Final answer text, set by generate_answer() or by retrieval_loop() on failure.
    answer: Optional[str] = None
    # Number of query rewrites performed so far by retrieval_loop().
    retry_count: int = 0

In [None]:
# TokenTextSplitter needs the optional `tiktoken` package. Probe for it here so
# load_and_chunk() can fall back to character-based splitting when it is missing.
# (The previous try-body contained nothing that could fail, so the flag was always True.)
try:
    import tiktoken  # noqa: F401  -- imported only to test availability
    TOKEN_SPLIT_AVAILABLE = True
except ImportError:
    TOKEN_SPLIT_AVAILABLE = False

def _hash_text(t: str) -> str:
    """Return the SHA-256 hex digest of *t* after stripping surrounding whitespace.

    Used as the fingerprint for page de-duplication.
    """
    normalized = t.strip()
    digest = hashlib.sha256()
    digest.update(normalized.encode("utf-8"))
    return digest.hexdigest()

def _build_splitter(chunk_size_tokens: int, chunk_overlap_tokens: int):
    """Return a token-based splitter when usable, otherwise a character-based one."""
    if TOKEN_SPLIT_AVAILABLE:
        try:
            return TokenTextSplitter(
                chunk_size=chunk_size_tokens,
                chunk_overlap=chunk_overlap_tokens,
                encoding_name="cl100k_base"
            )
        except Exception as e:
            # Best effort: a missing/broken tokenizer must not abort the whole load.
            print(f"Token splitter unavailable ({e}); falling back to character splitting.")
    # The character fallback uses fixed sizes; the token parameters do not apply to it.
    return RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)


def load_and_chunk(state: RAGState, folder: str = "docs",
                   chunk_size_tokens: int = 350,
                   chunk_overlap_tokens: int = 50) -> RAGState:
    """Read every PDF under *folder*, split each page into chunks and store them in *state*.

    Args:
        state: Pipeline state; ``docs`` and ``chunks`` are overwritten on success.
        folder: Directory searched recursively for ``*.pdf`` files.
        chunk_size_tokens: Chunk size passed to the token splitter.
        chunk_overlap_tokens: Overlap passed to the token splitter.

    Returns:
        The same ``state``. When no chunk could be produced it is returned unchanged.

    Each chunk is a dict with ``text``, ``source`` (file name), ``page`` (1-based) and
    ``chunk_index`` (position of the chunk within its page). Pages whose stripped text
    hashes identically to an earlier page are skipped, including across files.
    """
    texts, chunks, seen_hashes = [], [], set()

    # The splitter does not depend on the page, so build it once instead of per page.
    splitter = _build_splitter(chunk_size_tokens, chunk_overlap_tokens)

    # sorted() makes the page/chunk order reproducible between runs.
    for file in sorted(Path(folder).rglob("*.pdf")):
        try:
            with fitz.open(file) as pdf:
                for i, page in enumerate(pdf, start=1):
                    text = page.get_text("text")
                    if not text or not text.strip():
                        continue

                    # Skip pages whose content was already seen (duplicate pages).
                    h = _hash_text(text)
                    if h in seen_hashes:
                        continue
                    seen_hashes.add(h)

                    texts.append(text)

                    # Attach the source metadata to every chunk of this page.
                    for ci, ch in enumerate(splitter.split_text(text)):
                        chunks.append({
                            "text": ch,
                            "source": file.name,
                            "page": i,
                            "chunk_index": ci
                        })

        except Exception as e:
            # One unreadable PDF must not stop the remaining files from loading.
            print(f"Failed to load PDF: {file.name} ({e})")

    if not chunks:
        # Report the directory that was actually searched, not a hard-coded name.
        print(f'No valid PDF chunks found in "{folder}" directory.')
        return state

    print(f"Loaded {len(texts)} pages and created {len(chunks)} chunks.")
    state.docs = texts
    state.chunks = chunks
    return state


In [None]:
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

In [None]:
# Embedding models: both SentenceTransformer checkpoints are loaded onto DEVICE up front.
# `qwen` is used by embed_and_store(); `bge` is used for storage, query encoding and
# retrieval evaluation.
QWEN_MODEL = "Qwen/Qwen3-Embedding-0.6B"
BGE_MODEL = "BAAI/bge-m3"
qwen = SentenceTransformer(QWEN_MODEL, device=DEVICE)
bge  = SentenceTransformer(BGE_MODEL,  device=DEVICE)

In [None]:
# Ask each model for its embedding dimension; these sizes are used for the
# "qwen" and "bge" named vectors when the Qdrant collection is created.
QWEN_DIM = qwen.get_sentence_embedding_dimension()
BGE_DIM  = bge.get_sentence_embedding_dimension()

In [None]:
# Qdrant connection settings. Host and port come from the environment when set
# and default to a local server on localhost:6333.
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
# Name of the collection that holds both named vectors.
COLLECTION = "pkyoo_personal_docs_dualvec"

In [None]:
# Open the Qdrant connection and fail fast when the server is unreachable.
try:
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=60)
    # Listing the collections is a cheap round-trip that proves the server answers.
    client.get_collections()
except Exception:
    print("Failed to connect Qdrant. Check Qdrant Docker is running.")
    print("e.g., docker run -p 6333:6333 -v $(pwd)/qdrant_storage:/qdrant/storage qdrant/qdrant")
    # Re-raise so the notebook stops here with the original traceback.
    raise

In [None]:
# Snapshot of the collection names currently on the server (re-run before relying on it).
existing = [c.name for c in client.get_collections().collections]

In [None]:
# Make sure the dual-vector collection is present; create it on first run.
if COLLECTION in existing:
    print(f"ℹCollection exists: {COLLECTION}")
else:
    # One named vector per embedding model, both compared by cosine distance.
    named_vectors = {
        "qwen": qmodels.VectorParams(size=QWEN_DIM, distance=qmodels.Distance.COSINE),
        "bge": qmodels.VectorParams(size=BGE_DIM, distance=qmodels.Distance.COSINE),
    }
    client.create_collection(collection_name=COLLECTION, vectors_config=named_vectors)
    print(f"Created collection: {COLLECTION}")

Open http://localhost:6333/dashboard to view the local Qdrant collections dashboard.

In [None]:
# Free the Qwen embedding model to release GPU memory once embedding is done. bge is still needed for retrieval.
# del qwen
# torch.cuda.empty_cache()

In [None]:
def embed_and_store(state: RAGState, batch_size=128, upsert_batch=2048) -> RAGState:
    """Embed every chunk with both models and upsert the vectors into Qdrant.

    Args:
        state: Pipeline state whose ``chunks`` were filled by load_and_chunk().
        batch_size: Batch size passed to both ``encode()`` calls.
        upsert_batch: Number of points sent per ``client.upsert`` request.

    Returns:
        The same ``state`` (it is not modified).

    Each stored point carries the named vectors ``qwen`` and ``bge`` and a payload with
    ``text``, ``source``, ``page`` and ``chunk_index`` taken from the chunk dict.
    """
    if not state.chunks:
        print("No chunks found. Run load_and_chunk() first.")
        return state

    # load_and_chunk() stores each chunk as a dict, but the encoders need plain strings.
    # Bare strings are wrapped so hand-built chunk lists are handled the same way.
    records = [c if isinstance(c, dict) else {"text": str(c)} for c in state.chunks]
    texts = [str(r.get("text", "")) for r in records]

    # 1) Embedding
    print("Encoding with Qwen3-Embedding-0.6B ...")
    qwen_vecs = qwen.encode(
        texts, batch_size=batch_size,
        show_progress_bar=True, normalize_embeddings=True
        )
    print("Encoding with bge-m3 ...")
    bge_vecs = bge.encode(
        texts, batch_size=batch_size,
        show_progress_bar=True, normalize_embeddings=True
        )

    # 2) Build the points (with metadata)
    points = []
    for i, (rec, text, qv, bv) in enumerate(zip(records, texts, qwen_vecs, bge_vecs)):
        payload = {
            "text": text,
            # Keep the chunk's own index; fall back to the global position if absent.
            "chunk_index": rec.get("chunk_index", i),
            "source": rec.get("source"),
            "page": rec.get("page"),
        }
        points.append(qmodels.PointStruct(
            id=str(uuid.uuid4()),
            vector={"qwen": qv.tolist(), "bge": bv.tolist()},
            payload=payload
        ))

    # 3) Batched upsert
    print(f"Upserting {len(points)} vectors → {COLLECTION}")
    for s in range(0, len(points), upsert_batch):
        client.upsert(collection_name=COLLECTION, points=points[s:s+upsert_batch], wait=True)
    print("Upsert finished.")
    return state

# Release qwen only after embedding has finished:
# run `del qwen; torch.cuda.empty_cache()` only after embed_and_store() has been called.


In [None]:
def retrieve_from_qdrant(state: RAGState, top_k: int = 5) -> RAGState:
    """Fetch the *top_k* chunks closest to ``state.query`` from Qdrant.

    The query is embedded with bge-m3 and searched against the ``bge`` named vector of
    ``COLLECTION``.

    Args:
        state: Pipeline state; ``query`` is read and ``results`` is overwritten.
        top_k: Maximum number of hits requested.

    Returns:
        The same ``state``. Without a query it is returned unchanged.
    """
    if not state.query:
        print("No user query provided in state.query")
        return state

    print("Generating query embedding using bge-m3")

    # Step 1: Encode the query text using the retrieval embedding model (bge-m3)
    query_vec = bge.encode(
        [state.query],
        normalize_embeddings=True  # cosine similarity requires normalized vectors
    )[0]

    # Step 2: Search in Qdrant using the 'bge' vector field.
    # The vector name must match the one used when the points were stored. The
    # (name, vector) tuple form takes a plain list of floats, so convert the array
    # just as embed_and_store() does with .tolist().
    print(f"Searching Qdrant collection '{COLLECTION}' ...")
    hits = client.search(
        collection_name=COLLECTION,
        query_vector=("bge", query_vec.tolist()),
        limit=top_k
    )

    # Step 3: Collect text, metadata and score for each hit.
    results = []
    for h in hits:
        payload = h.payload or {}  # a point stored without payload yields None
        results.append({
            "text": payload.get("text", ""),
            "score": getattr(h, "score", None),
            "id": getattr(h, "id", None),
            "source": payload.get("source"),
            "page": payload.get("page"),
            "chunk_index": payload.get("chunk_index")
        })

    # Step 4: Store retrieved chunks in RAG state
    state.results = results
    print(f"Retrieved {len(results)} chunks.")
    return state


In [None]:
import numpy as np
from sklearn.metrics import ndcg_score

In [None]:
def evaluate_retrieval_ranked(state: RAGState, top_k: int = 5,
                              relevance_threshold: float = 0.4,
                              ndcg_threshold: float = 0.6,
                              mrr_threshold: float = 0.5,
                              use_qdrant_scores: bool = False) -> str:
    """Judge the retrieved chunks with ranking metrics and pick the next step.

    Args:
        state: Pipeline state; ``query`` and ``results`` are read, nothing is written.
        top_k: Number of leading results that are evaluated.
        relevance_threshold: Minimum query/chunk cosine similarity to count as relevant.
        ndcg_threshold: nDCG value at or above which retrieval is accepted.
        mrr_threshold: Reciprocal rank at or above which retrieval is accepted.
        use_qdrant_scores: Rank by the stored Qdrant scores instead of the recomputed
            similarities (ignored when a result has no score).

    Returns:
        ``"generate"`` when either metric reaches its threshold, otherwise ``"rewrite"``.

    NOTE(review): relevance labels are derived from the same bge similarities that are
    used as the ranking by default, so nDCG is 1.0 whenever at least one chunk passes
    the threshold and 0.0 otherwise. A real ranking signal needs independent labels.
    """
    if not state.results:
        print("No retrieved chunks to evaluate. Query rewrite required.")
        return "rewrite"

    if not state.query:
        # Nothing to compare the chunks against; encoding a missing query would fail.
        print("No user query provided in state.query")
        return "rewrite"

    print("Evaluating retrieval quality using ranking metrics...")

    # Evaluate the top_k retrieved texts
    results_k = state.results[:top_k]
    texts = [r["text"] for r in results_k]

    # Step 1: Encode the query with bge-m3
    query_vec = bge.encode([state.query], normalize_embeddings=True)[0]

    # Step 2: Encode the retrieved chunks
    retrieved_vecs = bge.encode(texts, normalize_embeddings=True)

    # Step 3: Cosine similarity per chunk (vectors are normalized, so a dot product suffices)
    sims = np.dot(retrieved_vecs, query_vec)

    # Step 4: Derive relevance labels (1 if above threshold, else 0)
    relevance = (sims >= relevance_threshold).astype(int)

    # Step 5: Compute ranking metrics (predicted scores vs. relevance labels)
    if use_qdrant_scores and all(r.get("score") is not None for r in results_k):
        preds = np.array([r["score"] for r in results_k], dtype=float)
    else:
        # Missing scores would produce an object array that ndcg_score cannot handle.
        preds = sims

    if len(results_k) > 1:
        ndcg = float(ndcg_score([relevance], [preds]))
    else:
        # Recent scikit-learn versions raise ValueError for a single-document ranking;
        # with one document nDCG reduces to "is that document relevant?".
        ndcg = float(relevance[0])

    if np.any(relevance == 1):
        first_relevant_idx = int(np.argmax(relevance == 1))
        reciprocal_rank = 1.0 / (first_relevant_idx + 1)
    else:
        reciprocal_rank = 0.0

    print(f"nDCG@{top_k}: {ndcg:.3f}, MRR: {reciprocal_rank:.3f}")

    # Step 6: Decision logic
    if ndcg >= ndcg_threshold or reciprocal_rank >= mrr_threshold:
        print("Retrieval ranking is satisfactory. Proceeding to generation.")
        return "generate"
    else:
        print("Retrieval ranking is poor. Triggering query rewrite.")
        return "rewrite"


In [None]:
def rewrite_query(state: RAGState) -> RAGState:
    """Ask the module-level ``llama3`` model to rephrase ``state.query``.

    Args:
        state: Pipeline state; ``query`` is replaced by the rewritten query.

    Returns:
        The same ``state``. If the model returns nothing usable, the current query
        is kept so the next retrieval does not run on an empty string.
    """
    prompt = f"""
    You are a query rewriter for a retrieval system.
    Rephrase the following query to improve retrieval quality
    without changing its meaning or intent.
    Respond with the rewritten query only.

    Query:
    "{state.query}"
    """

    # invoke() takes one prompt string. generate() expects a list of prompts and
    # returns a result object, not text, so it cannot be stripped.
    response = llama3.invoke(prompt)
    # Plain LLMs return str; chat-style models return a message with `.content`.
    new_query = str(getattr(response, "content", response) or "")
    # Models often echo the surrounding quotes from the prompt; drop them.
    new_query = new_query.strip().strip('"').strip()

    if not new_query:
        print("Rewriter returned an empty query; keeping the current query.")
        return state

    print(f"""
          Rewritten query (attempt {state.retry_count + 1}):
          "{new_query}"
          """)
    state.query = new_query
    return state

The loop stops after 5 query rewrites (at most 6 retrieval attempts) to bound the response time and avoid endless querying.

In [None]:
def retrieval_loop(state: RAGState, max_retries: int = 5, top_k: int = 5) -> RAGState:
    """Retrieve, evaluate and rewrite the query until retrieval is good enough.

    Args:
        state: Pipeline state; ``query``, ``results``, ``retry_count`` and possibly
            ``answer`` are updated.
        max_retries: Maximum number of query rewrites before giving up.
        top_k: Number of chunks retrieved and evaluated per attempt.

    Returns:
        The same ``state``. On success ``results`` holds the accepted chunks. On
        failure ``answer`` holds the fallback message and ``results`` is emptied.
    """
    while state.retry_count <= max_retries:
        print(f"\n[Attempt {state.retry_count + 1}] Retrieving and evaluating...")

        # Step 1: Retrieve
        state = retrieve_from_qdrant(state, top_k=top_k)

        # Step 2: Evaluate
        result = evaluate_retrieval_ranked(state, top_k=top_k)

        if result == "generate":
            print("Retrieval sufficient to proceed LLM generation.")
            return state  # Pass to generation stage

        # Step 3: If evaluation fails, rewrite query
        if state.retry_count < max_retries:
            print("Rewriting query and retrying retrieval...")
            state = rewrite_query(state)
            state.retry_count += 1
        else:
            print("Retrieval failed after max attempts. No relevant info found.")
            # Drop the rejected chunks so a later generate_answer() call cannot build
            # an answer from them and overwrite the fallback message.
            state.results = []
            state.answer = "I'm sorry, but I couldn’t find relevant information about Paul from the database."
            return state
    # Only reached when retry_count already exceeded max_retries on entry.
    return state

In [None]:
# NOTE(review): `langchain.llms` looks like a legacy import path; with langchain>=0.3 the
# Ollama wrapper is presumably shipped in a separate package — TODO confirm this import
# resolves in the target environment.
from langchain.llms import Ollama
# Shared Llama 3 client with a low sampling temperature; referenced by rewrite_query().
llama3 = Ollama(model="llama3", temperature=0.2)

In [None]:
def generate_answer(state: RAGState, model_name: str = "llama3", max_context: int = 5) -> RAGState:
    """
    Generate an answer using retrieved chunks as context.
    Uses a local Llama3 model (via Ollama) for answer generation.

    Args:
        state: Pipeline state; ``results`` and ``query`` are read, ``answer`` is written.
        model_name: Ollama model name. The module-level ``llama3`` client is reused when
            it serves this model; otherwise a new client is created. An already built
            client object exposing ``invoke()`` is accepted as well.
        max_context: Number of leading results included in the prompt.

    Returns:
        The same ``state`` with ``answer`` set to the model output, to the fallback
        message when there are no results, or to an error message when the model
        call fails.
    """

    # Step 1: Retrieve top-k context chunks (from previous retrieval step)
    if not state.results:
        print("No retrieval results found. Run retrieval_loop() first.")
        state.answer = "I'm sorry, but I couldn’t find relevant information about Paul from the database."
        return state

    # Limit context to top N results for efficiency
    top_contexts = [r["text"] for r in state.results[:max_context]]
    combined_context = "\n\n".join(top_contexts)

    # Step 2: Build a system + user prompt
    prompt = f"""
    You are a precise and concise AI assistant specialized in retrieval-augmented generation.
    Use the following context extracted from trusted documents to answer the user's query accurately.
    If the context does not contain sufficient information, clearly say so without hallucinating.

    --- Context ---
    {combined_context}

    --- User Query ---
    {state.query}

    --- Instruction ---
    1. Base your answer only on the given context.
    2. DO NOT invent facts not present in the documents(context).
    3. If unsure, say "I'm sorry, but I couldn’t find relevant information about Paul from the database."
    4. Return your final answer clearly and concisely.
    """

    print("Generating answer...")
    try:
        # model_name is a string, so it cannot be invoked directly; resolve it to a client.
        if hasattr(model_name, "invoke"):
            llm = model_name
        elif getattr(llama3, "model", None) == model_name:
            llm = llama3
        else:
            llm = Ollama(model=model_name, temperature=0.2)
        response = llm.invoke(prompt)
        # Plain LLMs return str; chat-style models return a message with `.content`.
        answer_text = str(getattr(response, "content", response)).strip()
    except Exception as e:
        # Best effort: report the failure in the answer instead of aborting the pipeline.
        print(f"Failed to connect GenAI Model: {e}")
        answer_text = "Error: failed to generate answer due to model connection or runtime issue."

    # Step 3: Store generated answer
    state.answer = answer_text

    # Step 4: Log summary
    print("\n====== Generated Answer ======")
    print(" ")
    print(answer_text)
    print(" ")
    print("================================\n")

    return state
