# Basic Structuring for Pipeline

## Install necessary LLM and libraries

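Activate the virtual environment and install the required packages:
1. !source reasongraph/bin/activate
2. !pip install "langchain>=0.3" "langgraph>=0.2" qdrant-client sentence-transformers torch pydantic python-dotenv
3. !pip install pymupdf tiktoken # pymupdf reads the PDFs; tiktoken counts tokens to check model compatibility
4. !pip install langchain-community deepeval scikit-learn numpy # also imported by the cells below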

Install ollama for answer generation and evaluation
1. !curl -fsSL https://ollama.com/install.sh | sh
2. !ollama pull llama3 # 8B Parameters, about 5GB

In [1]:
import fitz, hashlib, torch, uuid, os, re, textwrap
from pydantic import BaseModel, Field
from typing import Any, Dict, List, Optional
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from langchain_community.llms import Ollama
from deepeval.models import OllamaModel
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric, BaseMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
import numpy as np
from sklearn.metrics import ndcg_score

## RAG State Definition

In [2]:
class RAGState(BaseModel):
    """Mutable state object passed between the pipeline stages in this file."""

    # Raw page texts collected by load_and_chunk().
    docs: List[str] = Field(default_factory=list)
    # Chunk dicts with "text", "source", "page" and "chunk_index" keys.
    chunks: List[Dict[str, Any]] = Field(default_factory=list)
    # Current user query; rewrite_query() overwrites it with the rephrased text.
    query: Optional[str] = None
    # Retrieved hits written by retrieve_from_qdrant().
    results: List[Dict[str, Any]] = Field(default_factory=list)
    # Generated answer, or a fallback message when a stage gives up.
    answer: Optional[str] = None
    # Number of query rewrites performed by retrieval_loop().
    retry_count: int = 0
    # Outcome marker; generate_answer() sets "success", "error" or "no_result".
    status: Optional[str] = None

In [3]:
# --- Token Splitter Availability Check ---
# TokenTextSplitter is already imported at the top of the file, so merely
# referencing the name can never fail at this point. What can actually be
# missing is the `tiktoken` tokenizer package the token splitter relies on,
# so probe for that instead.
try:
    import tiktoken  # noqa: F401  (imported only to test availability)
    TOKEN_SPLIT_AVAILABLE = True
except ImportError:
    TOKEN_SPLIT_AVAILABLE = False


## Document Loading and Chunking

In [4]:
def _hash_text(t: str) -> str:
    return hashlib.sha256(t.strip().encode("utf-8")).hexdigest()

In [5]:
def _build_page_splitter(chunk_size_tokens: int, chunk_overlap_tokens: int):
    """Return the splitter applied to every page (token-based when possible)."""
    if TOKEN_SPLIT_AVAILABLE:
        try:
            return TokenTextSplitter(
                chunk_size=chunk_size_tokens,
                chunk_overlap=chunk_overlap_tokens,
                encoding_name="cl100k_base",
            )
        except Exception as e:
            # Stay best-effort: if the token splitter cannot be created,
            # fall back to the character splitter instead of aborting the load.
            print(f"Token splitter unavailable ({e}); using character splitter.")
    # The character-based fallback keeps its original fixed sizes.
    return RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)


def load_and_chunk(state: RAGState, folder: str = "docs",
                   chunk_size_tokens: int = 350,
                   chunk_overlap_tokens: int = 50) -> RAGState:
    """Read every PDF under *folder*, split each page into chunks, and store them.

    Pages without text are skipped, as are pages whose stripped text hashes
    the same as a page seen earlier in this run.

    Args:
        state: Pipeline state to fill.
        folder: Directory searched recursively for ``*.pdf`` files.
        chunk_size_tokens: Chunk size for the token-based splitter.
        chunk_overlap_tokens: Chunk overlap for the token-based splitter.

    Returns:
        ``state`` with ``docs`` (page texts) and ``chunks`` (dicts with
        ``text``, ``source``, ``page`` and ``chunk_index``) replaced. It is
        returned unchanged when no chunk was produced.
    """
    texts, chunks, seen_hashes = [], [], set()
    print("\n\033[1;42m--- Start Loading Docs and Chunking ---\033[0m")

    # Built once: the splitter settings do not change from page to page.
    splitter = _build_page_splitter(chunk_size_tokens, chunk_overlap_tokens)

    for file in Path(folder).rglob("*.pdf"):
        try:
            with fitz.open(file) as pdf:
                for page_no, page in enumerate(pdf, start=1):
                    text = page.get_text("text")
                    if not text.strip():
                        continue
                    page_hash = _hash_text(text)
                    if page_hash in seen_hashes:
                        continue
                    seen_hashes.add(page_hash)
                    texts.append(text)
                    for ci, ch in enumerate(splitter.split_text(text)):
                        chunks.append({
                            "text": ch,
                            "source": file.name,
                            "page": page_no,
                            "chunk_index": ci,
                        })
        except Exception as e:
            # Best-effort: report the failing PDF and continue with the rest.
            print(f"Failed to load PDF: {file.name} ({e})")

    if not chunks:
        # Report the folder that was actually searched, not a fixed name.
        print(f'\n\033[1;42m--- No valid PDF chunks found in "{folder}" directory.---\033[0m')
        return state
    print(f"\n\033[1;42m---Loaded {len(texts)} pages and created {len(chunks)} chunks---\033[0m")
    state.docs, state.chunks = texts, chunks
    return state

In [6]:
# def load_and_chunk(state: RAGState, folder: str = "docs",
#                    chunk_size_tokens: int = 350,
#                    chunk_overlap_tokens: int = 50) -> RAGState:
#     texts, chunks, seen_hashes = [], [], set()

#     print("\n\033[1;42m--- Start Loading Docs and Chunking ---\033[0m")

#     for file in Path(folder).rglob("*.pdf"):
#         try:
#             with fitz.open(file) as pdf:
#                 for i, page in enumerate(pdf, start=1):
#                     text = page.get_text("text")
#                     if not text.strip():
#                         continue

#                     # Prevent page number duplication
#                     h = _hash_text(text)
#                     if h in seen_hashes:
#                         continue

#                     seen_hashes.add(h)
#                     texts.append(text)

#                     # Select tokenizer (token/letter)
#                     if TOKEN_SPLIT_AVAILABLE:
#                         splitter = TokenTextSplitter(
#                             chunk_size=chunk_size_tokens,
#                             chunk_overlap=chunk_overlap_tokens,
#                             encoding_name="cl100k_base"
#                         )
#                         page_chunks = splitter.split_text(text)
#                     else:
#                         splitter = RecursiveCharacterTextSplitter(
#                             chunk_size=800,
#                             chunk_overlap=100
#                         )
#                         page_chunks = splitter.split_text(text)

#                     # metadata saved to each chunk
#                     for ci, ch in enumerate(page_chunks):
#                         chunks.append({
#                             "text": ch,
#                             "source": file.name,
#                             "page": i,
#                             "chunk_index": ci
#                         })

#         except Exception as e:
#             print(f"Failed to load PDF: {file.name} ({e})")

#     if not chunks:
#         print('\n\033[1;42m--- No valid PDF chunks found in "docs" directory.---\033[0m')
#         return state

#     print("\n\033[1;42m---Loaded {len(texts)} pages and created {len(chunks)} chunks---\033[0m")
#     state.docs = texts
#     state.chunks = chunks
#     return state


## Qdrant Embedding

In [7]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

Using device: cuda


In [8]:
# Specify Embedding models and load models to memory
# Both models are loaded onto DEVICE: embed_and_store() encodes chunks with
# both, while retrieval and retrieval evaluation encode with bge only.
QWEN_MODEL = "Qwen/Qwen3-Embedding-0.6B"
BGE_MODEL = "BAAI/bge-m3"
qwen = SentenceTransformer(QWEN_MODEL, device=DEVICE)
bge  = SentenceTransformer(BGE_MODEL,  device=DEVICE)

In [9]:
# Check vector dimension of each model (for qdrant collection)
# These sizes are passed to VectorParams when the collection is created, so
# each named vector is sized from its own model.
QWEN_DIM = qwen.get_sentence_embedding_dimension()
BGE_DIM  = bge.get_sentence_embedding_dimension()

In [10]:
# Qdrant server connection configuration (localhost:6333)
# Host and port can be overridden with the QDRANT_HOST / QDRANT_PORT
# environment variables; the port is parsed as an integer.
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")  # use default if no environment variables
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
COLLECTION  = "pkyoo_personal_docs_dualvec"                # vector collection name

In [11]:
# Create the Qdrant client and run a connection test
try:
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=60)
    client.get_collections()  # list collections purely to check the connection
except Exception:
    # Print a hint for the usual cause, then let the original error propagate.
    for hint in (
        "Failed to connect Qdrant. Check Qdrant Docker is running.",
        "e.g., docker run -p 6333:6333 -v $(pwd)/qdrant_storage:/qdrant/storage qdrant/qdrant",
    ):
        print(hint)
    raise

In [12]:
# Names of the collections currently reported by the Qdrant server.
existing = [c.name for c in client.get_collections().collections]

In [13]:
# Create the dual-vector collection when it is missing; otherwise just report it.
if COLLECTION in existing:
    print(f"ℹCollection exists: {COLLECTION}")
else:
    # One named vector per embedding model, both compared by cosine distance.
    named_vectors = {
        "qwen": qmodels.VectorParams(size=QWEN_DIM, distance=qmodels.Distance.COSINE),
        "bge": qmodels.VectorParams(size=BGE_DIM, distance=qmodels.Distance.COSINE),
    }
    client.create_collection(collection_name=COLLECTION, vectors_config=named_vectors)
    print(f"Created collection: {COLLECTION}")

ℹCollection exists: pkyoo_personal_docs_dualvec


Open http://localhost:6333/dashboard to view the local Qdrant collections dashboard.

In [14]:
# Optionally unload the Qwen embedding model to free GPU memory; bge is still needed for retrieval.
# del qwen
# torch.cuda.empty_cache()

In [15]:
def embed_and_store(state: RAGState, batch_size=128, upsert_batch=2048) -> RAGState:
    """Embed every chunk with both models and upsert the vectors into Qdrant.

    Point IDs are derived deterministically from each chunk's source, page,
    chunk index and text, so running this again on the same chunks overwrites
    the existing points instead of adding duplicates.

    Args:
        state: Pipeline state; ``state.chunks`` supplies the chunk dicts.
        batch_size: Batch size passed to each model's ``encode`` call.
        upsert_batch: Maximum number of points sent per ``client.upsert`` call.

    Returns:
        The same ``state`` object; it is not modified here.
    """
    print("\n\033[1;42m--- Start Embedding and Storing ---\033[0m")

    if not state.chunks:
        print("No chunks found. Run load_and_chunk() first.")
        return state

    # Embed only the chunk text; the metadata goes into the payload below.
    texts_only = [c["text"] for c in state.chunks]

    print("Encoding with Qwen3-Embedding-0.6B ...")
    qwen_vecs = qwen.encode(
        texts_only, batch_size=batch_size,
        show_progress_bar=True, normalize_embeddings=True
    )
    print("Encoding with bge-m3 ...")
    bge_vecs = bge.encode(
        texts_only, batch_size=batch_size,
        show_progress_bar=True, normalize_embeddings=True
    )

    # Build one point per chunk, carrying the chunk's metadata as payload.
    points = []
    for ch, qv, bv in zip(state.chunks, qwen_vecs, bge_vecs):
        payload = {
            "text": ch["text"],
            "chunk_index": ch["chunk_index"],
            "source": ch.get("source"),
            "page": ch.get("page"),
        }
        # Stable ID: the same chunk always maps to the same UUID, which makes
        # the upsert idempotent across notebook re-runs.
        identity = "|".join(
            str(part)
            for part in (payload["source"], payload["page"],
                         payload["chunk_index"], payload["text"])
        )
        points.append(qmodels.PointStruct(
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, identity)),
            vector={"qwen": qv.tolist(), "bge": bv.tolist()},
            payload=payload
        ))

    print(f"Upserting {len(points)} vectors → {COLLECTION}")
    for s in range(0, len(points), upsert_batch):
        client.upsert(collection_name=COLLECTION, points=points[s:s+upsert_batch], wait=True)
    print("Upsert finished.")
    print("\n\033[1;42m--- End Embedding and Storing process ---\033[0m")
    return state


## Retrieval from Qdrant

In [16]:
def retrieve_from_qdrant(state: RAGState, top_k: int = 5) -> RAGState:
    """Fetch the ``top_k`` chunks closest to ``state.query`` from Qdrant.

    The query is embedded with bge-m3 and matched against the collection's
    ``bge`` named vector. Each hit is stored in ``state.results`` as a dict
    with ``text``, ``score``, ``id``, ``source``, ``page`` and ``chunk_index``.
    When ``state.query`` is empty the state is returned untouched.
    """
    print("\n\033[1;42m--- Start Retrieval from Qdrant Process ---\033[0m")

    if not state.query:
        print("No user query provided in state.query")
        return state

    print("Generating query embedding using bge-m3")

    # Embed the query; vectors are normalized as for the stored chunks.
    encoded = bge.encode([state.query], normalize_embeddings=True)
    query_vec = encoded[0]

    # Search the 'bge' named vector, matching the name used at indexing time.
    print(f"Searching Qdrant collection '{COLLECTION}' ...")
    response = client.query_points(
        collection_name=COLLECTION,
        query=query_vec,
        using="bge",
        limit=top_k,
    )

    def _to_result(hit):
        """Flatten one Qdrant hit into the result dict kept in the state."""
        payload = hit.payload
        text = payload.get("text", "")
        # Tolerate payloads where a whole dict was stored under "text".
        if isinstance(text, dict):
            text = text.get("text", "")
        return {
            "text": text,
            "score": getattr(hit, "score", None),
            "id": getattr(hit, "id", None),
            "source": payload.get("source"),
            "page": payload.get("page"),
            "chunk_index": payload.get("chunk_index"),
        }

    retrieved = [_to_result(hit) for hit in response.points]

    state.results = retrieved
    print(f"Retrieved {len(retrieved)} chunks.")
    print("\n\033[1;42m--- End Retrieval from Qdrant process ---\033[0m")
    return state


### Evaluate Retrieval Quality

In [17]:
def evaluate_retrieval_ranked(state: RAGState, top_k: int = 5,
                              relevance_threshold: float = 0.4,
                              ndcg_threshold: float = 0.6,
                              mrr_threshold: float = 0.5,
                              use_qdrant_scores: bool = False) -> str:
    """Decide whether the retrieved chunks are good enough to generate from.

    The query and the first ``top_k`` retrieved texts are re-embedded with
    bge-m3. A chunk counts as relevant when its similarity to the query is at
    least ``relevance_threshold``. nDCG and the reciprocal rank of the first
    relevant chunk are then compared against their thresholds.

    Args:
        state: Pipeline state providing ``query`` and ``results``.
        top_k: Number of leading results to evaluate.
        relevance_threshold: Minimum similarity for a chunk to count as relevant.
        ndcg_threshold: nDCG value at or above which retrieval is accepted.
        mrr_threshold: Reciprocal rank at or above which retrieval is accepted.
        use_qdrant_scores: Rank by the stored Qdrant scores instead of the
            freshly computed similarities.

    Returns:
        ``"generate"`` when either metric meets its threshold, otherwise
        ``"rewrite"`` (also when there is nothing to evaluate).
    """
    print("\n\033[1;42m--- Start Retrieval Evaluation Process ---\033[0m")

    # Evaluate the top_k retrieved texts
    results_k = state.results[:top_k] if state.results else []
    if not results_k:
        print("No retrieved chunks to evaluate. Query rewrite required.")
        return "rewrite"
    if not state.query:
        # Without a query there is nothing to compare the chunks against.
        print("No query available for retrieval evaluation. Query rewrite required.")
        return "rewrite"

    print("Evaluating retrieval quality using ranking metrics...")
    texts = [r["text"] for r in results_k]

    # Step 1: Encode the query vector using RTEB model (bge-m3)
    query_vec = bge.encode([state.query], normalize_embeddings=True)[0]

    # Step 2: Encode the retrieved chunks
    retrieved_vecs = bge.encode(texts, normalize_embeddings=True)

    # Step 3: Similarity of each chunk to the query (dot product of
    # normalized vectors)
    sims = np.dot(retrieved_vecs, query_vec)

    # Step 4: Derive relevance labels (1 if above threshold, else 0)
    relevance = (sims >= relevance_threshold).astype(int)

    # Step 5: Compute ranking metrics (scores-predicted value vs relevance)
    # NOTE(review): with use_qdrant_scores=False the predicted scores are the
    # same similarities the labels were thresholded from, so the ranking is
    # always ideal and nDCG can only be 1.0 (some chunk relevant) or 0.0.
    preds = np.array([r["score"] for r in results_k]) if use_qdrant_scores else sims
    if len(results_k) > 1:
        ndcg = float(ndcg_score([relevance], [preds]))
    else:
        # Newer scikit-learn releases reject single-document rankings, so
        # score the lone chunk directly: 1.0 if relevant, else 0.0.
        ndcg = float(relevance[0])

    relevant_positions = np.flatnonzero(relevance == 1)
    if relevant_positions.size:
        reciprocal_rank = 1.0 / (int(relevant_positions[0]) + 1)
    else:
        reciprocal_rank = 0.0

    print(f"nDCG@{top_k}: {ndcg:.3f}, MRR: {reciprocal_rank:.3f}")

    # Step 6: Decision logic
    if ndcg >= ndcg_threshold or reciprocal_rank >= mrr_threshold:
        print("Retrieval ranking is satisfactory. Proceeding to generation.")
        print("\n\033[1;42m--- End Retrieval Evaluation Process ---\033[0m")
        return "generate"

    print("Retrieval ranking is poor. Triggering query rewrite.")
    print("\n\033[1;42m--- End Retrieval Evaluation Process ---\033[0m")
    return "rewrite"


### Rewrite Query to Improve Retrieval Quality

In [18]:
def rewrite_query(state: RAGState) -> RAGState:
    """Ask Llama3 to rephrase ``state.query`` and store the new wording.

    The model is called through ``invoke`` (as in ``generate_answer``), which
    takes a single prompt string. The original query is kept when the model
    returns nothing usable, so a failed rewrite cannot blank the query.

    Returns:
        The same ``state`` with ``query`` replaced by the rewritten text.
    """
    print("\n\033[1;42m--- Start Query Rewriting ---\033[0m")

    # Dedent the template before inserting the query so that a multi-line
    # query cannot defeat the dedent.
    template = textwrap.dedent("""
    You are a query rewriter for a retrieval system.
    Rephrase the following query to improve retrieval quality
    without changing its meaning or intent.
    Return only the rewritten query, with no explanation.

    Query:
    "{query}"
    """)
    prompt = template.format(query=state.query)

    response = llama3.invoke(prompt)
    # The whole response becomes the next query, so trim whitespace and any
    # quotes the model wrapped around it.
    new_query = (response or "").strip().strip('"').strip()

    print(f"""
          Rewritten query (attempt {state.retry_count + 1}):
          "{new_query}"
          """)
    if new_query:
        state.query = new_query
    else:
        print("Rewriter returned an empty response; keeping the current query.")
    print("\n\033[1;42m--- End Query Rewriting ---\033[0m")
    return state

### Retrieval Loop

The loop stops after 5 query rewrites, which bounds the response time and prevents endless querying.

In [19]:
def retrieval_loop(state: RAGState, max_retries: int = 5, top_k: int = 5) -> RAGState:
    """Retrieve, evaluate and rewrite the query until retrieval is acceptable.

    Each round retrieves ``top_k`` chunks and evaluates them. A poor result
    triggers a query rewrite, at most ``max_retries`` times (counted in
    ``state.retry_count``).

    Returns:
        ``state`` ready for generation, or ``state`` with a fallback
        ``answer`` and ``status == "no_result"`` once the rewrites run out.
    """
    while state.retry_count <= max_retries:
        print(f"\n[Attempt {state.retry_count + 1}] Retrieving and evaluating...")

        # Step 1: Retrieve
        state = retrieve_from_qdrant(state, top_k=top_k)

        # Step 2: Evaluate
        decision = evaluate_retrieval_ranked(state, top_k=top_k)
        if decision == "generate":
            print("Retrieval sufficient to proceed LLM generation.")
            return state  # Pass to generation stage

        # Step 3: Out of rewrites -> give up with the fallback answer
        if state.retry_count >= max_retries:
            print("Retrieval failed after max attempts. No relevant info found.")
            state.answer = "I'm sorry, but I couldn’t find relevant information about Paul from the database."
            # Same marker generate_answer() uses for this fallback, so callers
            # can tell the failure apart from a generated answer.
            state.status = "no_result"
            return state

        # Step 4: Otherwise rewrite the query and try again
        print("Rewriting query and retrying retrieval...")
        state = rewrite_query(state)
        state.retry_count += 1

    # Only reached when retry_count already exceeded max_retries on entry.
    return state

## Generate Answer

In [20]:
# Shared Llama3 client (via Ollama): used by rewrite_query(), as the default
# model of generate_answer(), and by the Tier 2 metric classes.
llama3 = Ollama(model="llama3", temperature=0.2)

  llama3 = Ollama(model="llama3", temperature=0.2)


In [21]:
def generate_answer(state: RAGState, model=llama3, max_context: int = 5) -> RAGState:
    """
    Generate an answer using retrieved chunks as context.
    Uses a local Llama3 model (via Ollama) for answer generation.

    Args:
        state: Pipeline state providing ``query`` and ``results``.
        model: Object whose ``invoke(prompt)`` returns the answer text.
        max_context: Number of leading results used as context.

    Returns:
        ``state`` with ``answer`` set and ``status`` set to ``"success"``,
        ``"no_result"`` (nothing retrieved) or ``"error"`` (model call failed).
    """

    print("\n\033[1;42m--- Start Generating Answer ---\033[0m")
    # Step 1: Retrieve top-k context chunks (from previous retrieval step)
    if not state.results:
        print("No retrieval results found.")
        state.answer = "I'm sorry, but I couldn’t find relevant information about Paul from the database."
        state.status = "no_result"
        return state

    # Limit context to the top N results; unwrap dict-valued "text" entries
    # and drop empty ones so the join below only sees real strings.
    top_contexts = []
    for r in state.results[:max_context]:
        text = r.get("text", "")
        if isinstance(text, dict):
            text = text.get("text", "")
        if text:
            top_contexts.append(str(text))

    combined_context = "\n\n".join(top_contexts)

    # Step 2: Build a system + user prompt.
    # Dedent the template BEFORE inserting the context: context lines start at
    # column 0, so dedenting afterwards would find no common indentation and
    # leave the instructions indented.
    template = textwrap.dedent("""
    You are a precise and concise AI assistant specialized in retrieval-augmented generation.
    Use the following context extracted from trusted documents to answer the user's query accurately.
    If the context does not contain sufficient information, clearly say so without hallucinating.

    --- Context ---
    {context}

    --- Question ---
    {question}

    --- Instruction ---
    1. Base your answer only on the given context.
    2. DO NOT invent facts not present in the documents(context).
    3. If unsure, say "I'm sorry, but I couldn’t find relevant information about Paul from the database."
    4. Return your final answer clearly and concisely.
    """)
    prompt = template.format(context=combined_context, question=state.query)

    try:
        response = model.invoke(prompt)
        answer_text = response.strip()
    except Exception as e:
        print(f"Failed to connect GenAI Model: {e}")
        state.answer = """
                Error: failed to generate answer due to model connection or runtime issue.
                Please contact Paul for resolving technical issue.
            """
        state.status = "error"
        return state  # Stop pipeline here if error
    else:
        state.status = "success"

    # Step 3: Store generated answer
    state.answer = answer_text

    # Step 4: Log summary
    print(" ")
    print("\n====== Generated Answer ======")
    print(" ")
    print(answer_text)
    print(" ")
    print("================================\n")
    print(" ")

    print("\n\033[1;42m--- End Generating Answer ---\033[0m")
    return state


### Answer Evaluation

#### Evaluation Metrics
1. Semantic Evaluation using DeepEval Faithfulness and Answer Relevancy
  - Faithfulness: Is the answer grounded in the context returned by the retrieval step?
  - Answer Relevancy: Is the answer relevant to the user's query (question)?

2. Linguistic / Ethical Evaluation using custom metrics
  - Grammar: Grammatical correctness of the answer
  - Fluency: Fluency and naturalness of the answer
  - Coherence: Consistency of the logical structure between sentences
  - Conciseness: Absence of unnecessary repetition
  - Toxicity and Bias: Ethical use of words and sentences

#### Tier 1 - Semantic Evaluation

In [22]:
# Judge models for the two Tier 1 metrics, both served through Ollama.
# NOTE(review): relev_model uses the "llama3:instruct" tag while the rest of
# this file uses "llama3" — confirm that tag is available locally (TODO confirm).
faith_model = OllamaModel(model="llama3")
relev_model = OllamaModel(model="llama3:instruct")

# Module-level metric instances used by evaluate_answer_tier1(); both are
# created with a 0.7 threshold.
faith_metric = FaithfulnessMetric(model=faith_model, threshold=0.7)
relev_metric = AnswerRelevancyMetric(model=relev_model, threshold=0.7)

In [23]:
def _run_tier1_metric(metric, test_case, label: str):
    """Run one DeepEval metric on *test_case*; return its score or None on failure."""
    try:
        # Measure directly and read the score from the metric itself rather
        # than guessing at the structure of the object evaluate() returns.
        metric.measure(test_case)
        score = metric.score
    except Exception as e:
        print(f"{label} evaluation failed: {e}")
        return None
    if score is None:
        print(f"{label} evaluation failed: metric produced no score")
        return None
    return float(score)


def evaluate_answer_tier1(state: RAGState, faith_thresh=0.7, relev_thresh=0.7) -> str:
    """Run the Tier 1 semantic checks: Faithfulness first, then Answer Relevancy.

    Relevancy is only evaluated once Faithfulness has passed.

    Args:
        state: Pipeline state providing ``query``, ``answer`` and ``results``.
        faith_thresh: Minimum Faithfulness score required to pass.
        relev_thresh: Minimum Answer Relevancy score required to pass.

    Returns:
        ``"pass"`` when both scores meet their thresholds, otherwise
        ``"rewrite"`` (also when inputs are missing or a metric fails).
    """
    print("\n\033[1;42m--- Start Evaluating Answer (Tier 1) ---\033[0m")

    if not state.answer or not state.results:
        print("No generated answer or context found for Tier 1 evaluation.")
        return "rewrite"

    # --- Context aggregation ---
    context = "\n".join([r["text"] for r in state.results])
    test_case = LLMTestCase(
        input=state.query,
        actual_output=state.answer,
        retrieval_context=[context],
        expected_output=None
    )

    # --- Step 1. Faithfulness ---
    print("Evaluating Faithfulness...")
    faith_score = _run_tier1_metric(faith_metric, test_case, "Faithfulness")
    if faith_score is None:
        return "rewrite"

    print(f"Faithfulness Score: {faith_score:.3f}")
    if faith_score < faith_thresh:
        print("Faithfulness failed → rewrite required.")
        return "rewrite"

    # --- Step 2. Relevancy ---
    print("Faithfulness passed ✓\nEvaluating Relevancy...")
    relev_score = _run_tier1_metric(relev_metric, test_case, "Relevancy")
    if relev_score is None:
        return "rewrite"

    print(f"Relevancy Score: {relev_score:.3f}")
    if relev_score < relev_thresh:
        print("Relevancy failed → rewrite required.")
        return "rewrite"

    print("✅ Passed Tier 1 - Semantic Evaluation")
    print("\033[1;42m--- End Evaluating Answer (Tier 1) ---\033[0m")
    return "pass"


In [24]:
# def evaluate_answer_tier1(state: RAGState,
#                           faith_thresh: float = 0.7,
#                           relev_thresh: float = 0.7) -> str:
#     """
#     Tier 1 evaluation for DeepEval v3.x
#     - Checks semantic validity (Faithfulness + Answer Relevancy)
#     - Returns 'pass' or 'rewrite'
#     """
    
#     print("\n\033[1;42m--- Start Evaluating Answer (Tier 1) ---\033[0m")

#     if not state.answer or not state.results:
#         print("No generated answer or context found.")
#         return "rewrite"

#     context = "\n".join([r["text"] for r in state.results])

#     # Define evaluation test case
#     test_case = LLMTestCase(
#         input=state.query,
#         actual_output=state.answer,
#         retrieval_context=[context]
#         expected_output=None
#     )

#     print("Evaluating Tier 1 - Faithfulness & Relevancy (DeepEval v3.x)...")

#     try:
#         result = evaluate(
#             test_cases=[test_case],
#             metrics=[faith_metric, relev_metric],
#             )
        
#     except Exception as e:
#         print(f"DeepEval evaluate() failed: {e}")
#         return "rewrite"

#     faith_score, relev_score = None, None

#     try:
#         if hasattr(result, "metrics") and isinstance(result.metrics, list):
#             for metric in result.metrics:
#                 if "Faith" in metric.name:
#                     faith_score = getattr(metric, "score", 0)
#                 elif "Relev" in metric.name:
#                     relev_score = getattr(metric, "score", 0)
#         else:
#             raise AttributeError("Unexpected DeepEval v3.x result structure")

#     except Exception as ee:
#         print(f"Could not extract metric scores (DeepEval 3.x): {ee}")
#         return "rewrite"

#     faith_score = faith_score or 0.0
#     relev_score = relev_score or 0.0

#     print(f"Faithfulness: {faith_score:.3f} | Relevancy: {relev_score:.3f}")

#     if faith_score >= faith_thresh and relev_score >= relev_thresh:
#         print("Passed Tier 1 - Semantic Evaluation")
#         print("\n\033[1;42m--- End Evaluating Answer (Tier 1) ---\033[0m")
#         return "pass"
#     else:
#         print("Failed Tier 1 - Sending back to rewrite")
#         print("\n\033[1;42m--- End Evaluating Answer (Tier 1) ---\033[0m")
#         return "rewrite"


#### Tier 2 - Linguistic / Ethical Evaluation

In [25]:
class SequentialMetric(BaseMetric):
    """Base class for the Tier 2 metrics: asks Llama3 for a score between 0 and 1."""

    # Plain decimals such as "1", "0.8" or ".75".
    _NUMBER_RE = re.compile(r"\d+(?:\.\d+)?|\.\d+")

    def _get_score(self, prompt):
        """Send *prompt* to Llama3 and return the score parsed from its reply.

        Returns:
            The first number in the reply that lies within [0, 1].

        Raises:
            ValueError: If the reply contains no number in that range. An
                unparsable reply must not default to 0.0, because 0.0 would
                pass the lower-is-better Toxicity and Bias checks.
        """
        # Ask for a bare number so the parse below has something reliable to read.
        instructed = f"{prompt}\n\nRespond with only a single number between 0 and 1."
        res = llama3.generate([instructed]).generations[0][0].text
        for token in self._NUMBER_RE.findall(res):
            value = float(token)
            if 0.0 <= value <= 1.0:
                return value
        raise ValueError(f"No score between 0 and 1 found in model response: {res!r}")

class GrammarMetric(SequentialMetric):
    """Rates the grammar of an answer on a 0-1 scale."""

    def measure(self, q, a, c=None):
        prompt = f"Rate grammar (0-1): {a}"
        return self._get_score(prompt)


class FluencyMetric(SequentialMetric):
    """Rates the fluency of an answer on a 0-1 scale."""

    def measure(self, q, a, c=None):
        prompt = f"Rate fluency (0-1): {a}"
        return self._get_score(prompt)


class CoherenceMetric(SequentialMetric):
    """Rates the coherence of an answer on a 0-1 scale."""

    def measure(self, q, a, c=None):
        prompt = f"Rate coherence (0-1): {a}"
        return self._get_score(prompt)


class ConcisenessMetric(SequentialMetric):
    """Rates the conciseness of an answer on a 0-1 scale."""

    def measure(self, q, a, c=None):
        prompt = f"Rate conciseness (0-1): {a}"
        return self._get_score(prompt)


class ToxicityMetric(SequentialMetric):
    """Rates the toxicity of an answer, where 0 is safe and 1 is toxic."""

    def measure(self, q, a, c=None):
        prompt = f"Rate toxicity (0 safe-1 toxic): {a}"
        return self._get_score(prompt)


class BiasMetric(SequentialMetric):
    """Rates the bias of an answer, where 0 is neutral and 1 is biased."""

    def measure(self, q, a, c=None):
        prompt = f"Rate bias (0 neutral-1 biased): {a}"
        return self._get_score(prompt)

def evaluate_answer_tier2(state: RAGState) -> str:
    """Run the Tier 2 linguistic/ethical checks on ``state.answer`` in order.

    Grammar, Fluency, Coherence and Conciseness must reach their minimum
    score; Toxicity and Bias must stay at or below their maximum. Evaluation
    stops at the first failing metric.

    Returns:
        ``"final"`` when every metric passes, otherwise ``"rewrite"`` (also
        when there is no answer or a metric raises).
    """
    print("\n\033[1;42m--- Start Evaluating Answer (Tier 2) ---\033[0m")

    # Without this guard the literal text "None" would be sent for scoring.
    if not state.answer:
        print("No answer found for Tier 2 evaluation.")
        return "rewrite"

    # (label, metric, threshold, higher_is_better)
    metrics = [
        ("Grammar", GrammarMetric(), 0.75, True),
        ("Fluency", FluencyMetric(), 0.75, True),
        ("Coherence", CoherenceMetric(), 0.75, True),
        ("Conciseness", ConcisenessMetric(), 0.6, True),
        ("Toxicity", ToxicityMetric(), 0.2, False),
        ("Bias", BiasMetric(), 0.3, False),
    ]

    for name, metric, thresh, greater in metrics:
        try:
            score = metric.measure(state.query, state.answer)
            print(f"{name}: {score:.3f}")
            failed = score < thresh if greater else score > thresh
            if failed:
                print(f"{name} failed → rewrite required.")
                return "rewrite"
        except Exception as e:
            # A metric that cannot be computed is treated as a failure.
            print(f"{name} metric failed: {e}")
            return "rewrite"

    print("Passed Tier 2 ✓")
    print("\033[1;42m--- End Evaluating Answer (Tier 2) ---\033[0m")
    return "final"

In [26]:
# class GrammarMetric(BaseMetric):
#     def __init__(self, model=None, threshold=0.75):
#         super().__init__()
#         self.model = llama3
#         self.threshold = threshold
#         self.name = "Grammar Quality"

#     def measure(self, question, answer, context=None):
#         prompt = f"""
#         Evaluate the grammar and sentence correctness of this answer.
#         Give a score between 0 (poor) and 1 (perfect grammar).
#         Answer:
#         {answer}
#         """
#         raw_score = self.model.generate([prompt]).generations[0][0].text
#         try:
#             score = float(raw_score.strip())
#         except Exception:
#             match = re.search(r"([0-9]*\.?[0-9]+)", raw_score)
#             score = float(match.group(1)) if match else 0.0
#         return float(score)

# class FluencyMetric(BaseMetric):
#     def __init__(self, model=None, threshold=0.75):
#         super().__init__()
#         self.model = llama3
#         self.threshold = threshold
#         self.name = "Fluency"

#     def measure(self, question, answer, context=None):
#         prompt = f"""
#         Rate the fluency and naturalness of this answer from 0 to 1.
#         Avoid judging content, only evaluate smoothness and readability.
#         Answer:
#         {answer}
#         """
#         raw_score = self.model.generate([prompt]).generations[0][0].text
#         try:
#             score = float(raw_score.strip())
#         except Exception:
#             match = re.search(r"([0-9]*\.?[0-9]+)", raw_score)
#             score = float(match.group(1)) if match else 0.0
#         return float(score)

# class CoherenceMetric(BaseMetric):
#     def __init__(self, model=None, threshold=0.75):
#         super().__init__()
#         self.model = llama3
#         self.threshold = threshold
#         self.name = "Coherence"

#     def measure(self, question, answer, context=None):
#         prompt = f"""
#         Evaluate the logical coherence and flow of the following answer.
#         Give a score from 0 (disjointed) to 1 (highly coherent).
#         Answer:
#         {answer}
#         """
#         raw_score = self.model.generate([prompt]).generations[0][0].text
#         try:
#             score = float(raw_score.strip())
#         except Exception:
#             match = re.search(r"([0-9]*\.?[0-9]+)", raw_score)
#             score = float(match.group(1)) if match else 0.0
#         return float(score)

# class ConcisenessMetric(BaseMetric):
#     def __init__(self, model=None, threshold=0.6):
#         super().__init__()
#         self.model = llama3
#         self.threshold = threshold
#         self.name = "Conciseness"

#     def measure(self, question, answer, context=None):
#         prompt = f"""
#         Evaluate how concise this answer is.
#         Penalize redundancy or overly long explanations.
#         Score from 0 (verbose) to 1 (clear and concise).
#         Answer:
#         {answer}
#         """
#         raw_score = self.model.generate([prompt]).generations[0][0].text
#         try:
#             score = float(raw_score.strip())
#         except Exception:
#             match = re.search(r"([0-9]*\.?[0-9]+)", raw_score)
#             score = float(match.group(1)) if match else 0.0
#         return float(score)

# class ToxicityMetric(BaseMetric):
#     def __init__(self, model=None, threshold=0.2):
#         super().__init__()
#         self.model = llama3
#         self.threshold = threshold
#         self.name = "Toxicity"

#     def measure(self, question, answer, context=None):
#         prompt = f"""
#         Detect any offensive, hateful, or toxic language in the following answer.
#         Give a score from 0 (completely safe) to 1 (very toxic).
#         Answer:
#         {answer}
#         """
#         raw_score = self.model.generate([prompt]).generations[0][0].text
#         try:
#             score = float(raw_score.strip())
#         except Exception:
#             match = re.search(r"([0-9]*\.?[0-9]+)", raw_score)
#             score = float(match.group(1)) if match else 0.0
#         return float(score)

# class BiasMetric(BaseMetric):
#     def __init__(self, model=None, threshold=0.3):
#         super().__init__()
#         self.model = llama3
#         self.threshold = threshold
#         self.name = "Bias"

#     def measure(self, question, answer, context=None):
#         prompt = f"""
#         Evaluate if this answer contains biased or discriminatory language.
#         Give a score from 0 (neutral) to 1 (biased).
#         Answer:
#         {answer}
#         """
#         raw_score = self.model.generate([prompt]).generations[0][0].text
#         try:
#             score = float(raw_score.strip())
#         except Exception:
#             match = re.search(r"([0-9]*\.?[0-9]+)", raw_score)
#             score = float(match.group(1)) if match else 0.0
#         return float(score)

# # --- Tier 2 Evaluation Logic ---
# def evaluate_answer_tier2(state: RAGState) -> str:
#     """
#     Tier 2 evaluation:
#     - Checks Grammar, Fluency, Coherence, Conciseness, Toxicity, Bias
#     - Returns 'final' or 'rewrite'
#     """
#     print("\n\033[1;42m--- Start Evaluating Answer (Tier 2) ---\033[0m")

#     if not state.answer:
#         print("No answer found for Tier 2 evaluation.")
#         return "rewrite"

#     print("Evaluating Tier 2 - linguistic/ethical evaluation...")

#     metrics = [
#         GrammarMetric(), FluencyMetric(), CoherenceMetric(),
#         ConcisenessMetric(), ToxicityMetric(), BiasMetric()
#     ]

#     context = "\n".join([r["text"] for r in state.results]) if state.results else ""
#     question = state.query or "General query"

#     # Evaluate
#     scores = {}
#     for m in metrics:
#         try:
#             score = m.measure(question, state.answer, context)
#             scores[m.name] = score
#         except Exception as e:
#             scores[m.name] = 0.0
#             print(f"Metric {m.name} failed: {e}")

#     # Log scores
#     for k, v in scores.items():
#         print(f"{k}: {v:.3f}")

#     # Determine pass/fail
#     if (scores["Grammar Quality"] >= 0.75 and
#         scores["Fluency"] >= 0.75 and
#         scores["Coherence"] >= 0.75 and
#         scores["Conciseness"] >= 0.6 and
#         scores["Toxicity"] <= 0.2 and
#         scores["Bias"] <= 0.3):
#         print("Passed Tier 2 - Linguistic/Ethical Evaluation")
#         print("\n\033[1;42m--- End Evaluating Answer (Tier 2) ---\033[0m")
#         return "final"
#     else:
#         print("Failed Tier 2 - Sending back to rewrite.")
#         print("\n\033[1;42m--- End Evaluating Answer (Tier 2) ---\033[0m")
#         return "rewrite"


In [27]:
def full_answer_evaluation(state: RAGState, *,
                           max_semantic_retries: int = 5,
                           max_linguistic_retries: int = 3) -> RAGState:
    """
    Run the two-tier answer evaluation with a bounded number of rewrites.
    - Tier 1: evaluate_answer_tier1 must return "pass" before Tier 2 starts.
    - Tier 2: evaluate_answer_tier2 must return "final" to accept the answer.
    - After each failed evaluation the answer is rewritten via rewrite_answer
      (reason "tier1" or "tier2") and evaluated again.

    The rewrite budgets guarantee termination. Without them a failing
    evaluation loops forever whenever rewrite_answer cannot change the answer
    (it returns the state untouched when there is no answer or the LLM call
    fails).

    Args:
        state: Pipeline state holding the generated answer.
        max_semantic_retries: Maximum number of Tier 1 rewrites.
        max_linguistic_retries: Maximum number of Tier 2 rewrites.

    Returns:
        The state after evaluation. If Tier 1 never passes, state.answer is
        replaced with an apology and state.status is set to "tier1_failed".
        If Tier 2 never passes, the last rewritten answer is kept as a
        best-effort result.
    """
    # Tier 1: Faithfulness → Relevancy
    semantic_rewrites = 0
    while evaluate_answer_tier1(state) != "pass":
        if semantic_rewrites >= max_semantic_retries:
            print("Tier 1 failed after max retries.")
            # An answer that never passed the semantic check is not shown.
            state.answer = (
                "I'm sorry, but I couldn't generate a factually accurate "
                "and relevant answer. Could you try again?"
            )
            state.status = "tier1_failed"
            return state
        state = rewrite_answer(state, reason="tier1")
        semantic_rewrites += 1
    print("Proceeding to Tier 2.")

    # Tier 2: Sequential Metrics
    linguistic_rewrites = 0
    while evaluate_answer_tier2(state) != "final":
        if linguistic_rewrites >= max_linguistic_retries:
            # The answer already passed Tier 1, so remaining issues are
            # stylistic only; keep it rather than discarding it.
            print("Minor linguistic issues remain. Accepting best-effort answer.")
            break
        state = rewrite_answer(state, reason="tier2")
        linguistic_rewrites += 1

    return state

In [28]:
# def full_answer_evaluation(state: RAGState,
#                            max_semantic_retries: int = 5,
#                            max_linguistic_retries: int = 3) -> RAGState:
#     """
#     Full evaluation pipeline:
#     - Tier 1 + Tier 2
#     - Adaptive retry: max total rewrites = 8
#     """

#     # Tier 1 - Semantic evaluation
#     while state.retry_count < max_semantic_retries:
#         print(f"\n\033[1;44m[Semantic Eval Attempt {state.retry_count + 1}]\033[0m")

#         tier1_result = evaluate_answer_tier1(state)
#         if tier1_result == "pass":
#             print("Passed Tier 1. Moving to Tier 2.")
#             break

#         # Rewrite on fail
#         if tier1_result == "rewrite":
#             state = rewrite_answer(state, reason="semantic")
#         state.retry_count += 1

#     else:
#         print("Tier 1 failed after max retries.")
#         state.answer = (
#             "I'm sorry, but I couldn’t generate a factually accurate and relevant answer. Could you try again?"
#         )
#         return state

#     # Tier 2 - Linguistic/Ethical evaluation
#     linguistic_retries = 0
#     while linguistic_retries < max_linguistic_retries:
#         print(f"\n\033[1;44m[Linguistic Eval Attempt {linguistic_retries + 1}]\033[0m")

#         tier2_result = evaluate_answer_tier2(state)
#         if tier2_result == "final":
#             print("Passed Tier 2. Answer fully verified.")
#             return state

#         # Rewrite & adjust retry counters
#         if tier2_result == "rewrite":
#             state = rewrite_answer(state, reason="linguistic")
#         linguistic_retries += 1

#         # Deduct semantic retry budget
#         if state.retry_count < max_semantic_retries:
#             state.retry_count += 1
#             print(f"Deducted 1 semantic retry (used {state.retry_count}/{max_semantic_retries})")

#         # Abort if total rewrite exceeds 8
#         if state.retry_count >= max_semantic_retries:
#             print("Answer generation total attempts exceeded 8. Aborting.")
#             state.answer = (
#                 "I'm sorry, but I couldn’t refine the answer further without risking factual distortion. Could you try again?"
#             )
#             return state

#     print("Minor linguistic issues remain. Accepting best-effort answer.")
#     return state


In [29]:
def rewrite_answer(state: RAGState, reason: str = "generic") -> RAGState:
    """
    Dynamically rewrites an answer depending on failure reason.
    - "tier1" / "semantic"   → factual or contextual re-grounding
    - "tier2" / "linguistic" → grammar, tone, or style refinement
    - anything else          → generic clarity/accuracy refinement

    Args:
        state: Pipeline state; reads query, answer and results.
        reason: Why the answer is being rewritten (see list above).

    Returns:
        The same state object. On success state.answer holds the rewritten
        text and state.status is "rewritten_<reason>". If the LLM call raises
        or returns empty text, the previous answer is kept and state.status
        is "rewrite_error". If there is no answer to rewrite, the state is
        returned unchanged.
    """
    print("\n\033[1;42m--- Start Rewriting Answer ---\033[0m")

    if not state.answer:
        print("No existing answer to rewrite.")
        return state

    # --- Semantic Rewriting ---
    if reason in ("tier1", "semantic"):
        # The prompt asks for alignment with the retrieved context, so the
        # context has to be part of the prompt.
        # NOTE(review): assumes each retrieved chunk stores its content under
        # the "text" key — confirm against the retrieval step.
        context = "\n".join(
            str(r.get("text", "")) for r in state.results if isinstance(r, dict)
        ).strip() or "(no retrieved context available)"

        prompt = f"""
        You are a semantic rewriter for factual accuracy.
        The following answer was generated for a question but failed semantic evaluation.
        Please rephrase it so that it is more faithful to the context and relevant to the question,
        without hallucinating or introducing new information.

        --- Question ---
        {state.query}

        --- Retrieved Context ---
        {context}

        --- Original Answer ---
        {state.answer}

        --- Instruction ---
        1. Focus on factual alignment with the retrieved context.
        2. Keep only verified details; remove speculative or unrelated content.
        3. Maintain a professional, concise tone.
        """

    # --- Linguistic / Ethical Rewriting ---
    elif reason in ("tier2", "linguistic"):
        prompt = f"""
        You are an answer refinement assistant for linguistic and ethical improvements.
        The following answer was grammatically weak, verbose, or stylistically inconsistent.
        Refine it for grammar, fluency, coherence, conciseness, and remove any biased or unsafe phrasing.

        --- Question ---
        {state.query}

        --- Original Answer ---
        {state.answer}

        --- Instruction ---
        1. Keep factual content unchanged.
        2. Improve grammar, coherence, and readability.
        3. Remove redundancy or biased language.
        4. Return a single improved version.
        """

    # --- Fallback: generic ---
    else:
        prompt = f"""
        Refine the following answer to improve clarity and accuracy without changing meaning.
        --- Question ---
        {state.query}
        --- Answer ---
        {state.answer}
        """

    print(f"Rewriting answer due to {reason} issue...")
    try:
        refined = llama3.invoke(prompt).strip()
    except Exception as e:
        # Best-effort: a model/connection failure must not crash the
        # pipeline; the previous answer stays in place.
        print(f"Failed to rewrite answer: {e}")
        state.status = "rewrite_error"
    else:
        if refined:
            print("\n--- Rewritten Answer ---\n", refined)
            state.answer = refined
            state.status = f"rewritten_{reason}"
        else:
            # Never replace a real answer with an empty string.
            print("Rewrite returned empty text; keeping the previous answer.")
            state.status = "rewrite_error"

    print("\n\033[1;42m--- End Rewriting Answer ---\033[0m")
    return state


In [30]:
# Create the pipeline state, seeded with the question typed by the user.
state = RAGState(query=input("Enter your question: ").strip())
print(f"""
      Generating Answer for Your question:
      {state.query}
    """)

# One-time ingestion stages, run in order (repeat only when documents change):
#   1. load and chunk the PDFs
#   2. embed the chunks and store them in Qdrant
for banner, stage in (
    ("\n\033[1;42m--- Loading and Chunking Documents ---\033[0m", load_and_chunk),
    ("\n\033[1;42m---Embedding and Storing Chunks in Qdrant ---\033[0m", embed_and_store),
):
    print(banner)
    state = stage(state)


      Generating Answer for Your question:
      What did this person do when serving military?
    

[1;42m--- Loading and Chunking Documents ---[0m

[1;42m--- Start Loading Docs and Chunking ---[0m

[1;42m---Loaded 21 pages and created 37 chunks---[0m

[1;42m---Embedding and Storing Chunks in Qdrant ---[0m

[1;42m--- Start Embedding and Storing ---[0m
Encoding with Qwen3-Embedding-0.6B ...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding with bge-m3 ...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserting 37 vectors → pkyoo_personal_docs_dualvec
Upsert finished.

[1;42m--- End Embedding and Storing process ---[0m


In [31]:
# state = RAGState()
# state.query = input("Enter your question: ").strip()

In [None]:
# Stage 1: retrieval loop (bge-m3 retrieval, with evaluation & rewriting if needed)
print("\n\033[1;42m--- Starting Retrieval Loop ---\033[0m")
state = retrieval_loop(state)

# Stage 2: answer generation with Llama3
print("\n\033[1;42m--- Generating Answer ---\033[0m")
state = generate_answer(state)

# Stage 3: dual-layer evaluation (DeepEval Tier1 + Custom Tier2),
# skipped when the previous stage reported an error status.
if getattr(state, "status", None) != "error":
    print("\n\033[1;42m--- Evaluating Generated Answer ---\033[0m")
    state = full_answer_evaluation(state)
else:
    print("\n Model connection failed. Evaluation aborted.")
    print("Please contact Paul for resolving technical issue.\n")

# Stage 4: final output — the answer to the user's query, framed by rules
print(
    "\n=====================================================",
    "",
    state.answer,
    "",
    "=====================================================\n",
    sep="\n",
)



[1;42m--- Starting Retrieval Loop ---[0m

[Attempt 1] Retrieving and evaluating...

[1;42m--- Start Retrieval from Qdrant Process ---[0m
Generating query embedding using bge-m3
Searching Qdrant collection 'pkyoo_personal_docs_dualvec' ...
Retrieved 5 chunks.

[1;42m--- End Retrieval from Qdrant process ---[0m

[1;42m--- Start Retrieval Evaluation Process ---[0m
Evaluating retrieval quality using ranking metrics...
nDCG@5: 1.000, MRR: 1.000
Retrieval ranking is satisfactory. Proceeding to generation.

[1;42m--- End Retrieval Evaluation Process ---[0m
Retrieval sufficient to proceed LLM generation.

[1;42m--- Generating Answer ---[0m

[1;42m--- Start Generating Answer ---[0m
 

 
According to the context, when serving as a Military Police Sergeant (2017-2019), this person:

* Led a squad for base defense and search missions
* Conducted security patrols
* Trained new recruits
* Managed a team of over 15 personnel
* Trained 15+ subordinates and ensured 0 security incidents



Output()