In [1]:
#pdf which is used https://www.barc.gov.in/barc_nl/2025/20250102.pdf

In [2]:
# Install required packages for PDF processing and RAG pipeline
# Mute pip and apt-get install output
!pip install "unstructured[pdf]" "llama-index-core>=0.10.0" "llama-index-embeddings-huggingface" transformers torch sentence-transformers faiss-cpu > /dev/null 2>&1
!apt-get install -y poppler-utils > /dev/null 2>&1


In [3]:
from unstructured.partition.pdf import partition_pdf
from llama_index.core import Document  # Import LlamaIndex Document class
from llama_index.core.node_parser import SentenceSplitter
import os

def is_english(text):
    """Check if text contains only ASCII (English) characters."""
    try:
        text.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False

def extract_pdf_text(pdf_path, output_dir="/kaggle/working/extracted_data"):
    """Extract only ASCII (English) text from PDF."""
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"Extracting English text from {pdf_path}...")
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )
    
    documents = []
    for element in elements:
        element_dict = element.to_dict()
        if element_dict["type"] not in ["Image", "Table"]:
            text = element_dict["text"]
            if is_english(text):
                documents.append(text)
    
    text_path = os.path.join(output_dir, "extracted_english_text.txt")
    with open(text_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(documents))
    
    print(f"Extracted {len(documents)} English text sections. Saved to {text_path}")
    return documents

# Example usage
pdf_path = "/kaggle/input/barc-sample/20250102.pdf"
documents = extract_pdf_text(pdf_path)



Extracting English text from /kaggle/input/barc-sample/20250102.pdf...


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

2025-06-14 18:07:24.801496: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749924445.062330      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749924445.136376      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Extracted 802 English text sections. Saved to /kaggle/working/extracted_data/extracted_english_text.txt


In [4]:
# --- Split into chunks ---
def split_documents(text_documents, chunk_size=512, chunk_overlap=50):
    # Convert strings to LlamaIndex Document objects
    chunk_documents = [Document(text=text) for text in documents]
    
    # Initialize SentenceSplitter
    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    
    # Split into nodes
    nodes = splitter.get_nodes_from_documents(chunk_documents)
    
    print(f"✅ Split into {len(nodes)} chunks")
    print(f"📦 Sample chunk:\n{nodes[0].text[:300]}...\n")
    return nodes

nodes = split_documents(documents)

✅ Split into 802 chunks
📦 Sample chunk:
Nuclear SciTech Leading Sustainable Development...



In [5]:
from llama_index.core import Document, VectorStoreIndex
from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter


# Initialize embedding model
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"  # Efficient embedding model
)
print("\n Creating vector store index...")
# Create vector index
vector_index = VectorStoreIndex(
    nodes, 
    embed_model=embed_model,
    show_progress=True  # Visual progress bar
)
print("✅ Vector store created successfully!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


 Creating vector store index...


Generating embeddings:   0%|          | 0/802 [00:00<?, ?it/s]

✅ Vector store created successfully!


In [6]:
# Verify index
print(f"Index contains {len(vector_index.docstore.docs)} documents")
print("Sample indexed document ID:", list(vector_index.docstore.docs.keys())[0])

Index contains 802 documents
Sample indexed document ID: f59ebd33-5bdd-477f-94ce-4c974948696b


In [7]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Get Hugging Face token and login
hf_token = UserSecretsClient().get_secret("HF_TOKEN")
login(token=hf_token)
#deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
# Load DeepSeek model and tokenizer
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="cpu",  # Change to "auto" or "cuda" if using GPU
    token=hf_token
)


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [8]:
def rag_answer(query: str) -> str:
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    # Step 1: Get query embedding
    query_embedding = embed_model.get_text_embedding(query)

    # Step 2: Retrieve top-k nodes
    retriever = vector_index.as_retriever(similarity_top_k=4)
    retrieved_nodes = retriever.retrieve(query)

    print(f"\n🔍 Retrieved Chunks with Cosine Similarity Scores for: \"{query}\"")
    for i, node in enumerate(retrieved_nodes):
        node_id = getattr(node, "node_id", f"node_{i}")
        node_embedding = embed_model.get_text_embedding(node.get_content())

        score = cosine_similarity(
            np.array(query_embedding).reshape(1, -1),
            np.array(node_embedding).reshape(1, -1)
        )[0][0]

        print(f"Chunk {i+1}: ID = {node_id}, Cosine Similarity = {score:.4f}")

    # Step 3: Build context
    rag_context = "\n\n".join([node.get_content() for node in retrieved_nodes])
    print("\n📄 RAG Context Retrieved:\n", rag_context, "...\n")  # Truncated display

    # Step 4: Build prompt
    prompt = f"""<|startoftext|>[INST] <<SYS>>
You are a helpful assistant. Provide an in-depth answer using the given context.avoid unecessary text.
Stop after giving one complete answer. Do not repeat information.Dont add system tags in output.
<</SYS>>

Context:
{rag_context}

Question: {query}

Answer: [/INST]"""

    # Step 5: Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
        )

    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response_text.split("[/INST]")[-1].strip()

    print("\n🧠 Model's Answer:\n")
    print(response)

    return response


In [9]:
import json

qa_data = [
  {
    "question": "What is CAP?",
    "answer": "Cold Atmospheric Plasma (CAP) is a non-thermal, cost-effective technology with diverse applications including surface sterilization, material modification, and biomedical treatments. It operates at room temperature and generates reactive species capable of inactivating microbes without causing thermal damage to tissues."
  },
  {
    "question": "Name the authors who explored the potential of Cold Atmospheric Plasma Device for infection reduction on dental surfaces.",
    "answer": "The authors who explored this potential include Vishakha Bende, Vandan Nagar, V. Saple, M. Doshi, C. Verma, R.L. Bhardwaj, and Rajib Kar."
  },
  {
    "question": "Briefly explain the procedure employed by BARC for testing efficacy of CAP for microbe reduction on dental surfaces.",
    "answer": "BARC conducted ex-vivo experiments on 11 freshly extracted infectious human teeth. The teeth were grouped into two protocols: TP1 and TP2. In TP1, four teeth were treated individually with CAP for 5 minutes using argon plasma and compared against a control group. In TP2, three teeth were treated similarly and colony-forming units (CFU) were measured pre- and post-treatment. The treated teeth were analyzed for microbial load reduction using serial dilution and agar plating techniques."
  },
  {
    "question": "On an average, what is the percentage reduction in microbes on dental surfaces after application of Cold Atmospheric Plasma Device?",
    "answer": "On average, the CAP treatment resulted in a microbial reduction efficiency of approximately 88.7%, with some samples achieving over 99% reduction."
  },
  {
    "question": "List out the solvents produced by Solvent Production Plant in Heavy Water Plant.",
    "answer": "The Solvent Production Plant in the Heavy Water Plant produces the following solvents: Tributyl Phosphate (TBP), Tri Iso Amyl Phosphate (TIAP), Tri Octyl Phosphine Oxide (TOPO), Di Hexyl Octanamide (DHOA), and Mono Ester of Di-2-Ethyl Hexyl Phosphonic Acid (DEHPA-II)."
  },
  {
    "question": "List out the facilities visited by students during National Science day celebration conducted in BARC.",
    "answer": "Students visiting BARC during the National Science Day celebrations typically toured several key research facilities, including:Dhruva reactor FOTIA ion accelerator DRHR robotics facility Exhibitions featuring scientific posters and models Film shows and inspirational lectures by eminent scientists "
  },
  {
    "question": "Role of BARC in Kumbh mela 2025",
    "answer": "BARC contributed to Kumbh Mela 2025 by deploying advanced bio-remediation technologies to manage sewage and wastewater, ensuring hygienic conditions and preventing waterborne diseases among millions of pilgrims."
  },
  {
    "question": "List out the organizations coming under DAE.",
    "answer": "DAE oversees research centers like BARC and IGCAR, industrial units like NFC and HWB, PSUs like NPCIL and UCIL, service organizations like DPS, and autonomous institutes like TIFR and TMC."
  }
]


In [10]:
!pip install rouge-score nltk scikit-learn
!python -m nltk.downloader punkt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=caf8f745ec8f9e495abfec0feb90809c4bb43d7b3d1ede5dea7328ba3d566fc0
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
import re
import string
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def normalize(text):
    """More comprehensive text normalization"""
    text = text.lower().strip()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def calculate_f1(gold, pred):
    """Improved F1 calculation using token-level alignment"""
    gold_tokens = gold.split()
    pred_tokens = pred.split()
    
    if not gold_tokens or not pred_tokens:
        return 0.0
    
    common = Counter(gold_tokens) & Counter(pred_tokens)
    num_same = sum(common.values())
    
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    
    if (precision + recall) == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def evaluate_qa(engine, test_data, verbose=False):
    metrics = {
        'em': [],
        'f1': [],
        'rouge_l': [],
        'bleu': []
    }
    
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smoothie = SmoothingFunction().method4

    for sample in test_data:
        try:
            query = sample['question']
            gold = normalize(sample['answer'])
            
            # Get model response
            response = engine(query)
            pred = normalize(response)
            
            if verbose:
                print(f"\nQ: {query}\nGold: {gold}\nPred: {pred}")

            # Exact Match
            metrics['em'].append(int(pred == gold))
            
            # Token-level F1
            metrics['f1'].append(calculate_f1(gold, pred))
            
            # ROUGE-L
            rouge = scorer.score(gold, pred)['rougeL']
            metrics['rouge_l'].append(rouge.fmeasure)
            
            # BLEU Score
            bleu = sentence_bleu(
                [gold.split()], 
                pred.split(),
                smoothing_function=smoothie
            )
            metrics['bleu'].append(bleu)
            
        except Exception as e:
            print(f"Error evaluating sample: {e}")
            continue

    # Aggregate results
    results = {
        'exact_match': np.mean(metrics['em']) * 100,
        'f1_score': np.mean(metrics['f1']),
        'rouge_l': np.mean(metrics['rouge_l']),
        'bleu_score': np.mean(metrics['bleu']),
        'samples_evaluated': len(metrics['em'])
    }
    
    # Print comprehensive report
    print("\n=== Evaluation Results ===")
    print(f"Exact Match: {results['exact_match']:.2f}%")
    print(f"F1 Score: {results['f1_score']:.4f}")
    print(f"ROUGE-L F1: {results['rouge_l']:.4f}")
    print(f"BLEU Score: {results['bleu_score']:.4f}")
    print(f"\nEvaluated {results['samples_evaluated']}/{len(test_data)} samples")
    
    return results

# Usage:
results = evaluate_qa(rag_answer, qa_data, verbose=True)


🔍 Retrieved Chunks with Cosine Similarity Scores for: "What is CAP?"
Chunk 1: ID = b5aa2a10-099a-4e11-bab5-36dbde67d3dd, Cosine Similarity = 0.6450
Chunk 2: ID = de0e59c9-70ba-473f-9d51-6f8eef492fe5, Cosine Similarity = 0.6302
Chunk 3: ID = 97a8e63f-fe8b-4fb3-8547-50ae1e1285f8, Cosine Similarity = 0.5974
Chunk 4: ID = aac894ae-981f-488d-af7e-45659a7bb068, Cosine Similarity = 0.6060

📄 RAG Context Retrieved:
 On average, the CAP treatment demonstrates a substantial pathogenic destruction efficiency of approximately 88.7%, encompassing both TP 1 and TP 2 protocols. These results underscore the potency and efficacy of the CAP treatment delivered by the developed device in significantly reducing microbial load on dental surfaces, thereby highlighting its potential utility in clinical settings for infection control and sterilization purposes.

The developed cold atmospheric pressure plasma (CAP) device demonstrated significant efficacy in reducing microbial load on dental surfaces. Optical