In [1]:
import json
import os
import time

import numpy as np
import pandas as pd

from pdf_parser import *
from embeddings import *
from vector_db_handler import *
from ragas_module import *
from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI
from ragas.llms import LangchainLLMWrapper

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:

# Input location settings, read from the process environment (populated by
# load_dotenv() earlier). Each value is None when its variable is not set.
base, file_name = (os.environ.get(key) for key in ("base_folder", "file_name"))



# Parse and Chunk PDF file.

In [3]:
# Build the parsing/chunking handler; note the argument order is (file_name, base).
chunker = ParsingAndChunkingHandler(file_name,base)
# Bare last expression: shows the handler's repr as the cell output.
chunker

<pdf_parser.ParsingAndChunkingHandler at 0x28a60009b20>

In [4]:
# Parse the PDF into elements. The return value is not captured here, so the
# result is presumably kept on the handler — verify in pdf_parser.
chunker.parse_pdf_to_elements()



The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [5]:
# Recurring boilerplate strings handed to the cleaner as noise to remove
# (they look like page headers/footers and banners — confirm against the PDF).
text_to_ignore = [
    "UNCLASSIFIED",
    "RESTRICTED",
    "CB-AGS Handbook",
    "NORMAL PROCEDURES",
    "NORMAL PROCEDURE", # Singular variant of the entry above
    "16814",
    "Decrmber 2024", # Intentional misspelling: matches a typo present in the PDF content
    "December 2024",
    "THIS PAGE INTENTIONALLY LEFT BLANK",
    "Page NP"
]
# Remove the noise elements; the cleaned list is presumably what is later read
# from chunker.cleaned_elements (how entries are matched is defined in pdf_parser).
chunker.clean_redundant_elements(text_to_ignore)

   -> Removed 43 noise elements out of 81.


In [6]:
## TODO: pass the remaining chunking parameters (only the elements are supplied for now)

# Chunk the cleaned elements title-wise; parsed_chunks is reused later for table
# handling and for building the knowledge graph.
parsed_chunks = chunker.find_and_chunk_title_wise(chunker.cleaned_elements)

2. Chunking filtered elements...


Assigning IDs to Chunks: 100%|██████████| 5/5 [00:00<?, ?it/s]


In [7]:
# Run every parsed chunk through the handler's table-handling step, keeping order.
processed_chunks = list(map(chunker.handle_table_in_chunk_for_embedding, parsed_chunks))



# Embedding Model

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from embeddings import *


# Embedding configuration comes from the environment (.env is loaded earlier).
EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME")

# Validate before converting: int(None) would otherwise fail with an opaque
# TypeError that does not say which setting is missing.
_embedding_dimension_raw = os.getenv("EMBEDDING_DIMENSION")
if not _embedding_dimension_raw:
    raise RuntimeError(
        "EMBEDDING_DIMENSION is not set; define it in the environment or the .env file."
    )
EMBEDDING_DIMENSION = int(_embedding_dimension_raw)

embedding_model = EmbeddingModel(EMBEDDING_MODEL_NAME, EMBEDDING_DIMENSION)


In [9]:



# Check the processed chunks against the embedding model's token limit. Per the
# rendered output, each chunk is reported as SAFE or TRUNCATED with its token count.
analysis = embedding_model.test_chunks_tokens_and_embedding_model_tokens(processed_chunks)
# Bare last expression: displays the analysis table.
analysis

Token indices sequence length is longer than the specified maximum sequence length for this model (1731 > 512). Running this sequence through the model will result in indexing errors


Analyzing 5 nodes against model limit: 512 tokens


Unnamed: 0,Node ID,Status,Token Count,Excess Tokens,Content Snippet
0,0,⚠️ TRUNCATED,1731,1219,SECTION CONTEXT: Table of Contents Before FII...
1,1,✅ SAFE,109,0,SECTION CONTEXT: Before Flight 1. Mission Pla...
2,2,✅ SAFE,21,0,SECTION CONTEXT: Before Power up On UTS serve...
3,3,⚠️ TRUNCATED,775,263,SECTION CONTEXT: N Verify the following MS C.B...
4,4,⚠️ TRUNCATED,661,149,SECTION CONTEXT: 3. _ Ensure the following MS ...


# Creating Vector DB

In [10]:


# Vector-store handler, built from the embedding model and its dimension.
db_handler = DbHandler(embedding_model, EMBEDDING_DIMENSION)

# Build the FAISS L2 index from the processed chunks (the handler logs that it
# generates embeddings first). The index is also read later via db_handler.faiss_index.
faiss_index = db_handler.build_faiss_L2_index(processed_chunks)
faiss_index


Generating embeddings for documents...
FAISS Index built with 5 documents.


<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000028981A0FF90> >

# Initializing RAGAs Module

In [11]:


from langchain_google_genai import ChatGoogleGenerativeAI
from ragas.llms import LangchainLLMWrapper
from utils import init_judge_llm
# ---------------------------------------------------------------------------
# Judge LLM initialization
# ---------------------------------------------------------------------------

# Read for the (currently disabled) init_judge_llm helper below; the Gemini model
# actually used is hard-coded in the ChatGoogleGenerativeAI call.
LLM_judge_model_type = os.getenv("LLM_JUDGE_MODEL_TYPE")
# judge_llm = init_judge_llm(LLM_judge_model_type)

# SECURITY: the API key must never be written into the notebook. It is expected
# in the environment (e.g. the .env file loaded by load_dotenv()), where
# ChatGoogleGenerativeAI picks it up. Any key that was previously committed in
# this cell must be treated as compromised and rotated.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to the environment or the .env file."
    )

judge_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.1,
)

# The chat model is passed to the RAGAs module unwrapped; the wrapped variant is
# kept for reference.
# ragas_llm = LangchainLLMWrapper(judge_llm)
ragas_llm = judge_llm

# ---------------------------------------------------------------------------
# Embedding model initialization
# ---------------------------------------------------------------------------

# Wrap the underlying LangChain embedding object so RAGAs can use it.
ragas_embeddings = LangchainEmbeddingsWrapper(embedding_model.embedding_model)


  ragas_embeddings = LangchainEmbeddingsWrapper(embedding_model.embedding_model)


In [12]:
# Bundle the RAGAs embeddings, the judge LLM and the FAISS index into the testing
# module that the remaining cells call into.
rag_module = RagasTestingModule(ragas_embeddings, ragas_llm, db_handler.faiss_index)

# Testing

In [13]:
def build_knowledge_graph(chunks, processed_chunks):
    """Build a RAGAs KnowledgeGraph with one DOCUMENT node per chunk.

    NOTE(review): the notebook calls ``rag_module.build_knowledge_graph`` rather
    than this notebook-local function; confirm whether this copy is still needed.

    Args:
        chunks: Original chunk objects; each must expose ``metadata.to_dict()``.
            Used only for metadata (filename, page number, id).
        processed_chunks: Cleaned text for each chunk, in the same order as
            ``chunks``. Used as the node's ``page_content``.

    Returns:
        The KnowledgeGraph built from the created nodes.

    Raises:
        ValueError: If the two inputs do not have the same length (previously
            the longer one was silently truncated by ``zip``).

    Side effects:
        Stores the created node list on the global ``rag_module.ragas_nodes``.
    """
    chunks = list(chunks)
    processed_chunks = list(processed_chunks)
    if len(chunks) != len(processed_chunks):
        raise ValueError(
            f"chunks and processed_chunks must have the same length "
            f"(got {len(chunks)} and {len(processed_chunks)})."
        )

    ragas_nodes = []

    # Walk the original chunks (metadata) and the processed text (content) in step.
    for raw_chunk, clean_text in zip(chunks, processed_chunks):
        source_meta = raw_chunk.metadata.to_dict()

        # Fall back through the available metadata keys, then to a fixed default.
        filename = source_meta.get('filename') or source_meta.get('file_directory') or "manual.pdf"
        page_num = source_meta.get('page_number', 0)
        doc_id = source_meta.get('id', 'unknown')

        # Nodes are created manually; the original author notes this avoids an
        # "AttributeError: from_langchain_document" problem.
        node = Node(
            type=NodeType.DOCUMENT,
            properties={
                "page_content": clean_text,
                "filename": filename,
                "page_number": page_num,
                "document_id": doc_id
            }
        )
        ragas_nodes.append(node)

    # Later cells read the nodes back from the module, so publish them there.
    rag_module.ragas_nodes = ragas_nodes

    kg = KnowledgeGraph(nodes=ragas_nodes)
    print("Knowledge Graph built successfully.")
    return kg

In [14]:



kg = rag_module.build_knowledge_graph(parsed_chunks, processed_chunks)

Knowledge Graph built successfully.


In [15]:
# Prompt used to generate retrieval test cases from one chunk of text.
# It is filled with str.format using a single `text` field, which is why the
# braces of the JSON example are doubled ({{ }}): they render as literal braces.
prompt_template = """
You are an expert Technical QA Analyst creating a dataset to evaluate a RAG retrieval system.
Your goal is to generate realistic search scenarios based on the text provided.

CONTEXT TEXT:
{text}

INSTRUCTIONS:
1. Analyze the text to identify key procedures, component states, warnings, or specifications.
2. Generate 5 to 10 DISTINCT search scenarios.
3. For each scenario, output a JSON object with:
   - "query": A specific keyword-based search query a user would type (e.g., "EFB power requirements").
   - "reference": The exact sentence or short paragraph from the text that answers the query. (This is the Ground Truth Answer).

OUTPUT FORMAT:
Return ONLY a raw JSON object containing a list under the key "test_cases".
{{
    "test_cases": [
        {{
            "query": "...",
            "reference": "..."
        }},
        ...
    ]
}}
"""

def get_teacher_values_for_keywords_retrieval_test2(prompt_template, max_context_chars=2000, delay_seconds=2):
    """Generate query/reference test cases for every node using the judge LLM.

    For each node in ``rag_module.ragas_nodes`` the node text (truncated to
    ``max_context_chars``) is inserted into ``prompt_template`` and sent to
    ``rag_module.ragas_judge_llm``. The reply is expected to be a JSON object
    with a ``"test_cases"`` list of ``{"query", "reference"}`` objects.

    Args:
        prompt_template: ``str.format`` template with a single ``{text}`` field.
        max_context_chars: Maximum number of characters of the node text sent to
            the model (the stored ``source_context`` is never truncated).
        delay_seconds: Pause after each node to stay within API rate limits;
            values <= 0 disable the pause.

    Returns:
        pandas.DataFrame with columns ``user_input``, ``reference``,
        ``source_context``, ``page`` and ``ids``. The columns are present even
        when no test case could be generated.
    """
    result_columns = ["user_input", "reference", "source_context", "page", "ids"]

    print("\nStarting Generation Loop...")

    test_cases = []

    for i, node in enumerate(rag_module.ragas_nodes):
        context = node.properties["page_content"]

        print(f"  - Processing Node {i} (Generating ~10 cases)...")

        # 1. Ask the judge LLM for test cases about this node.
        prompt = prompt_template.format(text=context[:max_context_chars])
        response = rag_module.ragas_judge_llm.invoke(prompt)

        # 2. Strip Markdown code fences the model may wrap around the JSON.
        clean_str = response.content.replace("```json", "").replace("```", "").strip()

        # 3. Parse the JSON. One malformed reply must not abort the whole run,
        #    so the node is reported and skipped instead.
        try:
            data = json.loads(clean_str)
        except json.JSONDecodeError as exc:
            print(f"    -> Failed (invalid JSON from model): {exc}")
            data = None

        # 4. Validate the payload and keep the well-formed cases.
        if isinstance(data, dict) and isinstance(data.get("test_cases"), list):
            current_batch_count = 0
            for item in data["test_cases"]:
                if isinstance(item, dict) and "query" in item and "reference" in item:
                    test_cases.append({
                        "user_input": item["query"],
                        "reference": item["reference"],
                        "source_context": context,
                        "page": node.properties["page_number"],
                        "ids": node.properties["document_id"]
                    })
                    current_batch_count += 1
            print(f"    -> Successfully added {current_batch_count} test cases.")
        elif data is not None:
            raw_keys = list(data.keys()) if isinstance(data, dict) else type(data).__name__
            print(f"    -> Failed (JSON missing 'test_cases' list). Raw keys: {raw_keys}")

        # Sleep to be polite to API limits.
        if delay_seconds > 0:
            time.sleep(delay_seconds)

    # Fix the column set so downstream code can index the frame even when empty.
    df_results = pd.DataFrame(test_cases, columns=result_columns)
    return df_results


# NOTE(review): this calls the rag_module method (name without the "2" suffix),
# not the notebook-local get_teacher_values_for_keywords_retrieval_test2 function.
evaluation_dataset = rag_module.get_teacher_values_for_keywords_retrieval_test(prompt_template)
# evaluation_dataset


Starting Generation Loop...
  - Processing Node 0 (Generating ~10 cases)...
    -> Successfully added 10 test cases.
  - Processing Node 1 (Generating ~10 cases)...
    -> Successfully added 9 test cases.
  - Processing Node 2 (Generating ~10 cases)...
    -> Successfully added 7 test cases.
  - Processing Node 3 (Generating ~10 cases)...
    -> Successfully added 9 test cases.
  - Processing Node 4 (Generating ~10 cases)...
    -> Successfully added 10 test cases.


In [31]:
def test_keywords_retrieval_faiss(scenarios, k =1):
    # ==========================================
    # STEP 3: Take the Exam (Run Retrieval)
    # ==========================================
    print("Step 3: Running the Exam (Retrieving contexts for queries)...")

    # We need to add a "retrieved_contexts" column to your test dataset
    # test_questions = scenarios[scenarios['page']!=1]["user_input"].tolist()
    test_questions = scenarios["user_input"].tolist()
    ground_truths = scenarios["reference"].tolist()
    ground_truths_ids = scenarios["ids"].tolist()

    retrieved_contexts = []
    retrieved_ids = []
    distances_found = []

    doc_texts = [node.properties["page_content"] for node in rag_module.ragas_nodes]
    doc_ids = [node.properties["document_id"] for node in rag_module.ragas_nodes]

    for query in test_questions:
        # 1. Embed the query using the same embedding model
        query_embedding = rag_module.ragas_embeddings.embed_query(query)
        query_embedding_np = np.array([query_embedding]).astype("float32")
        
        # 2. Search the FAISS index (Retrieve top 1 result)
        distances, indices = rag_module.index.search(query_embedding_np, 1)
        
        # 3. Extract the actual text based on the returned indices
        # indices[0] contains the list of IDs found for the first query
        found_texts = [doc_texts[idx] for idx in indices[0]]
        found_ids = [doc_ids[idx] for idx in indices[0]]

        dist_found = [distances[0][i] for i in range(len(distances[0]))]

        retrieved_contexts.append(found_texts)
        retrieved_ids.append(found_ids)
        distances_found.append(dist_found)  

        # Create the dataset Ragas expects
        evaluation_data = {
            "user_input": test_questions,      # What the user typed
            "reference": ground_truths, # The correct answer/fact
            "reference_ids":ground_truths_ids, #The correct doc ID      
            "retrieved_contexts": retrieved_contexts, # What your system found
            "retrieved_ids":retrieved_ids,
            "distances": distances_found
        }

    ragas_dataset = Dataset.from_dict(evaluation_data)

    return ragas_dataset

# Run the notebook-local retrieval function over the generated test cases (k passed as 1).
evaluation_results = test_keywords_retrieval_faiss(evaluation_dataset, k=1)

Step 3: Running the Exam (Retrieving contexts for queries)...


In [16]:
# NOTE(review): this reassigns `evaluation_results` using the rag_module method of
# the same name. The execution counts (In [31] vs In [16]) show the two retrieval
# cells were run out of order — keep only one and re-run top to bottom.
evaluation_results = rag_module.test_keywords_retrieval_faiss(evaluation_dataset, k=1)

Step 3: Running the Exam (Retrieving contexts for queries)...


In [None]:
# Display the retrieval results in tabular form for inspection.
evaluation_results.to_pandas()

In [None]:
print("Step 4: Grading with Ragas Metrics...")

# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from ragas.metrics import context_precision, context_recall

# Metrics used to grade retrieval:
# 1. Context Recall: did we retrieve the correct information? (sensitivity)
# 2. Context Precision: was the correct information ranked at the top? (ranking)

metrics = [context_recall, context_precision]

# Scoring calls the judge LLM; the recorded output shows the API answering
# 429 (quota exceeded) and the client retrying, so expect rate-limit delays.
rag_module.evaluate_metrics_for_test(metrics, evaluation_results)

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 16.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10, model: gemini-2.5-flash
Please retry in 44.214365167s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value