In [1]:
# Source PDF used in this notebook: https://www.barc.gov.in/barc_nl/2025/20250102.pdf

In [None]:
# Install required packages for PDF processing and RAG pipeline
!pip install --upgrade "unstructured[pdf]" \
    "llama-index-core>=0.10.0,<0.11.0" \
    "llama-index-embeddings-huggingface" \
    "llama-index-llms-openai" \
    "openai<2.0.0" \
    "transformers" \    
    "torch" \
    "sentence-transformers" \
    "faiss-cpu" \
    "ragas" > /dev/null 2>&1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
from unstructured.partition.pdf import partition_pdf
from llama_index.core import Document  # Import LlamaIndex Document class
from llama_index.core.node_parser import SentenceSplitter
import os

def is_english(text):
    """Return True when every character of ``text`` is ASCII, False otherwise.

    ASCII-only content is used as the notebook's proxy for "English" text.
    An empty string contains no non-ASCII character and therefore yields True.
    """
    return text.isascii()

def extract_pdf_text(pdf_path, output_dir="/kaggle/working/extracted_data"):
    """Extract the ASCII-only (English) text elements of a PDF.

    The PDF is partitioned with ``partition_pdf`` using the ``hi_res``
    strategy. Elements typed ``Image`` or ``Table`` are dropped, as are
    elements whose text is blank or contains any non-ASCII character. The
    surviving texts are also written, separated by blank lines, to
    ``extracted_english_text.txt`` inside ``output_dir``.

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory for the text dump; created if it does not exist.

    Returns:
        list[str]: The kept element texts, in the order ``partition_pdf``
        returned them.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"Extracting English text from {pdf_path}...")
    # Table elements are discarded below, so table-structure inference is not
    # requested. NOTE(review): per the unstructured docs that option only adds
    # an HTML rendering to Table elements' metadata - confirm for the installed
    # version.
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
    )

    skipped_types = {"Image", "Table"}
    documents = []
    for element in elements:
        element_dict = element.to_dict()
        if element_dict["type"] in skipped_types:
            continue
        text = element_dict["text"]
        # Blank text would pass the ASCII check and turn into empty
        # documents/chunks further down the pipeline, so skip it here.
        if not text or not text.strip():
            continue
        if is_english(text):
            documents.append(text)

    text_path = os.path.join(output_dir, "extracted_english_text.txt")
    with open(text_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(documents))

    print(f"Extracted {len(documents)} English text sections. Saved to {text_path}")
    return documents

# Example usage: run the extraction on the sample PDF from the Kaggle input
# directory. `documents` receives the list of text sections returned by
# extract_pdf_text.
pdf_path = "/kaggle/input/barc-sample/20250102.pdf"
documents = extract_pdf_text(pdf_path)



Extracting English text from /kaggle/input/barc-sample/20250102.pdf...


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

2025-06-17 16:32:06.296930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750177926.545994      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750177926.622215      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Extracted 802 English text sections. Saved to /kaggle/working/extracted_data/extracted_english_text.txt


In [7]:
# --- Split into chunks ---
def split_documents(text_documents, chunk_size=512, chunk_overlap=50):
    """Split raw text strings into chunks (LlamaIndex nodes).

    Args:
        text_documents: Iterable of strings; each one is wrapped in a
            LlamaIndex ``Document`` before splitting.
        chunk_size: Forwarded to ``SentenceSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``SentenceSplitter`` as ``chunk_overlap``.

    Returns:
        The list of nodes returned by
        ``SentenceSplitter.get_nodes_from_documents``.
    """
    # Use the argument that was passed in, not the notebook-global `documents`,
    # so the function works on whatever input the caller supplies.
    chunk_documents = [Document(text=text) for text in text_documents]

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = splitter.get_nodes_from_documents(chunk_documents)

    print(f"✅ Split into {len(nodes)} chunks")
    # Only preview a chunk when there is one; empty input must not raise.
    if nodes:
        print(f"📦 Sample chunk:\n{nodes[0].text[:300]}...\n")
    return nodes

nodes = split_documents(documents)

import os
from getpass import getpass

# SECURITY: never hard-code an API key in a notebook - it ends up in version
# control and in every shared copy. Any key that was previously committed here
# must be considered leaked and revoked.
# Provide the key through the environment (e.g. the platform's secret store)
# before running; as a fallback it is requested interactively without echo.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


✅ Split into 802 chunks
📦 Sample chunk:
Nuclear SciTech Leading Sustainable Development...



In [None]:
# --- Your previous code for embeddings and Ragas evaluation ---
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.schema import TextNode
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset
import pandas as pd
import numpy as np

# Embedding models to compare, keyed by their Hugging Face model name.
embedding_models = {
    "BAAI/bge-small-en-v1.5": HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    "sentence-transformers/all-MiniLM-L6-v2": HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    "sentence-transformers/paraphrase-MiniLM-L3-v2": HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"),
}

# Numbers of top-ranked retrieved chunks (k) to evaluate.
k_values = [3, 5, 7]

# Sample query for evaluation.
query = "What is CAP?" # You might want to make this more specific to your document content

# Placeholder answer: the Ragas dataset carries an 'answer' column, but no LLM
# generates one in this notebook. In a real RAG pipeline this would be the
# LLM's answer based on the retrieved context.
dummy_answer = "the term cap refers to cold atmospheric pressure but based on the provided text it seems more likely that cap actually means cold atmospheric plasma which is a type of gas discharge that produces highly reactive chemical species capable of destroying microorganisms without causing thermal damage to living tissue its used here as part of a medical device designed to reduce microbial loads on dental surface through disinfection"

# Hand-written reference text for the query, stored in the dataset's
# 'reference' column.
reference_answer = 'Cold Atmospheric Plasma (CAP) is a non-thermal, cost-effective technology with diverse applications including surface sterilization, material modification, and biomedical treatments. It operates at room temperature and generates reactive species capable of inactivating microbes without causing thermal damage to tissues.'

# Accumulators: one row per (embedding model, k) for the metric scores, and
# one row per retrieved chunk for later inspection.
evaluation_results = []
retrieved_chunks_data = []

# Convert nodes to TextNode objects with stable, readable ids for indexing.
llama_nodes = [TextNode(text=node.text, id_=f"node_{i}") for i, node in enumerate(nodes)]

for embed_name, embed_model in embedding_models.items():
    print(f"\n--- Evaluating with Embedding Model: {embed_name} ---")
    Settings.embed_model = embed_model

    # The index is rebuilt for each embedding model so that the vectors are
    # computed with the model currently under test.
    vector_index = VectorStoreIndex(llama_nodes)

    # Only retrieval is exercised here, so a plain retriever is used instead of
    # a query engine; no LLM has to be configured for this step.
    retriever = vector_index.as_retriever(similarity_top_k=max(k_values))

    # Retrieve once with the largest k; every smaller k is a prefix of this
    # ranked list, so the query does not need to be re-run per k.
    retrieval_response = retriever.retrieve(query)

    for k in k_values:
        print(f"\n--- Testing with k = {k} ---")

        top_k_nodes = retrieval_response[:k]
        retrieved_chunks = [node.text for node in top_k_nodes]
        retrieved_ids = [node.node_id for node in top_k_nodes]

        print(f"Retrieved {len(retrieved_chunks)} chunks for k={k} with {embed_name}:")
        for i, chunk in enumerate(retrieved_chunks):
            print(f"Chunk {i+1} (ID: {retrieved_ids[i]}):\n{chunk}\n---")
            retrieved_chunks_data.append({
                "embedding_model": embed_name,
                "k_value": k,
                "chunk_index": i + 1,
                "chunk_id": retrieved_ids[i],
                "chunk_content": chunk
            })

        # --- RAGAS Evaluation ---
        # Single-row dataset; 'contexts' expects a list of lists of strings.
        ragas_data = {
            'question': [query],
            'answer': [dummy_answer],
            'contexts': [list(retrieved_chunks)],
            'reference': [reference_answer]
        }

        dataset = Dataset.from_dict(ragas_data)

        print(f"Starting Ragas evaluation for {embed_name} with k={k}...")
        try:
            # Only the context metrics are computed; faithfulness and
            # answer_relevancy would need a genuinely generated answer.
            score = evaluate(
                dataset,
                metrics=[context_precision, context_recall],
            )
            df = score.to_pandas()

            result_row = {
                "embedding_model": embed_name,
                "k_value": k,
                "context_precision": df['context_precision'].iloc[0] if 'context_precision' in df.columns else np.nan,
                "context_recall": df['context_recall'].iloc[0] if 'context_recall' in df.columns else np.nan,
            }
        except Exception as e:
            # Best effort: a failed evaluation is reported and recorded as NaN
            # so the remaining model/k combinations still run.
            print(f"Error during Ragas evaluation for {embed_name} with k={k}: {e}")
            result_row = {
                "embedding_model": embed_name,
                "k_value": k,
                "context_precision": np.nan,
                "context_recall": np.nan,
            }
        evaluation_results.append(result_row)

print("\n--- Summary of All RAGAS Evaluation Results ---")
results_df = pd.DataFrame(evaluation_results)
results_df



--- Evaluating with Embedding Model: BAAI/bge-small-en-v1.5 ---

--- Testing with k = 3 ---
Retrieved 3 chunks for k=3 with BAAI/bge-small-en-v1.5:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_405):
The developed cold atmospheric pressure plasma (CAP) device demonstrated significant efficacy in reducing microbial load on dental surfaces. Optical emission spectroscopy confirmed the presence of reactive species, such as excited Ar atoms, atomic oxygen, and OH radicals, which play a crucial role in bacterial inactivation. Experimental results revealed that 

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Testing with k = 5 ---
Retrieved 5 chunks for k=5 with BAAI/bge-small-en-v1.5:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_405):
The developed cold atmospheric pressure plasma (CAP) device demonstrated significant efficacy in reducing microbial load on dental surfaces. Optical emission spectroscopy confirmed the presence of reactive species, such as excited Ar atoms, atomic oxygen, and OH radicals, which play a crucial role in bacterial inactivation. Experimental results revealed that a 5-minute CAP treatment achieved an average microbial reduction 

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Testing with k = 7 ---
Retrieved 7 chunks for k=7 with BAAI/bge-small-en-v1.5:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_405):
The developed cold atmospheric pressure plasma (CAP) device demonstrated significant efficacy in reducing microbial load on dental surfaces. Optical emission spectroscopy confirmed the presence of reactive species, such as excited Ar atoms, atomic oxygen, and OH radicals, which play a crucial role in bacterial inactivation. Experimental results revealed that a 5-minute CAP treatment achieved an average microbial reduction 

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Evaluating with Embedding Model: sentence-transformers/all-MiniLM-L6-v2 ---

--- Testing with k = 3 ---
Retrieved 3 chunks for k=3 with sentence-transformers/all-MiniLM-L6-v2:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_387):
Fig.1: (a) Schematic of cold plasma device, (b) depicts a photograph of the extracted teeth stored in saline within a jar, (c) a typical image captures the CAP treatment procedure being performed on one of the teeth
---
Chunk 3 (ID: node_402):
In Fig.5, the percentage reduction in microbial count following CAP treatment for TP 

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Testing with k = 5 ---
Retrieved 5 chunks for k=5 with sentence-transformers/all-MiniLM-L6-v2:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_387):
Fig.1: (a) Schematic of cold plasma device, (b) depicts a photograph of the extracted teeth stored in saline within a jar, (c) a typical image captures the CAP treatment procedure being performed on one of the teeth
---
Chunk 3 (ID: node_402):
In Fig.5, the percentage reduction in microbial count following CAP treatment for TP 1 and TP 2 is illustrated. The findings indicate that a 5-minute CAP treatment ad

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Testing with k = 7 ---
Retrieved 7 chunks for k=7 with sentence-transformers/all-MiniLM-L6-v2:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_387):
Fig.1: (a) Schematic of cold plasma device, (b) depicts a photograph of the extracted teeth stored in saline within a jar, (c) a typical image captures the CAP treatment procedure being performed on one of the teeth
---
Chunk 3 (ID: node_402):
In Fig.5, the percentage reduction in microbial count following CAP treatment for TP 1 and TP 2 is illustrated. The findings indicate that a 5-minute CAP treatment ad

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Evaluating with Embedding Model: sentence-transformers/paraphrase-MiniLM-L3-v2 ---

--- Testing with k = 3 ---
Retrieved 3 chunks for k=3 with sentence-transformers/paraphrase-MiniLM-L3-v2:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_405):
The developed cold atmospheric pressure plasma (CAP) device demonstrated significant efficacy in reducing microbial load on dental surfaces. Optical emission spectroscopy confirmed the presence of reactive species, such as excited Ar atoms, atomic oxygen, and OH radicals, which play a crucial role in bacterial ina

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Testing with k = 5 ---
Retrieved 5 chunks for k=5 with sentence-transformers/paraphrase-MiniLM-L3-v2:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_405):
The developed cold atmospheric pressure plasma (CAP) device demonstrated significant efficacy in reducing microbial load on dental surfaces. Optical emission spectroscopy confirmed the presence of reactive species, such as excited Ar atoms, atomic oxygen, and OH radicals, which play a crucial role in bacterial inactivation. Experimental results revealed that a 5-minute CAP treatment achieved an avera

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Testing with k = 7 ---
Retrieved 5 chunks for k=7 with sentence-transformers/paraphrase-MiniLM-L3-v2:
Chunk 1 (ID: node_403):
On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.
---
Chunk 2 (ID: node_405):
The developed cold atmospheric pressure plasma (CAP) device demonstrated significant efficacy in reducing microbial load on dental surfaces. Optical emission spectroscopy confirmed the presence of reactive species, such as excited Ar atoms, atomic oxygen, and OH radicals, which play a crucial role in bacterial inactivation. Experimental results revealed that a 5-minute CAP treatment achieved an avera

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]


--- Summary of All RAGAS Evaluation Results ---


Unnamed: 0,embedding_model,k_value,context_precision,context_recall
0,BAAI/bge-small-en-v1.5,3,0.583333,0.0
1,BAAI/bge-small-en-v1.5,5,0.638889,1.0
2,BAAI/bge-small-en-v1.5,7,0.679167,1.0
3,sentence-transformers/all-MiniLM-L6-v2,3,0.0,0.5
4,sentence-transformers/all-MiniLM-L6-v2,5,0.25,1.0
5,sentence-transformers/all-MiniLM-L6-v2,7,0.166667,1.0
6,sentence-transformers/paraphrase-MiniLM-L3-v2,3,0.583333,0.5
7,sentence-transformers/paraphrase-MiniLM-L3-v2,5,0.583333,0.5
8,sentence-transformers/paraphrase-MiniLM-L3-v2,7,0.583333,0.5
