# Final RAG setup, Testing and UI code file

This file consolidates the final pipeline after exploratory notebooks (initial_rag_experiments.ipynb and modular_rag_experiments.ipynb).  
the *experiment-driven choices* that materially affected performance are:

- **Fuzzy match threshold for reaction terms**: iteratively lowered **80 → 75 → 70** to improve recall without materially harming precision.
- **Retriever packing strategy**: decreased `estimated_avg_doc_tokens` **200 → 150 → 100** to fit more candidates; increased `max_k` **20 → 30 → 40** to raise recall ceiling.
- **Model choices**: BioBERT for biomedical embeddings; LLaMA 3 8B (quantized) for efficient local inference.



In [2]:
# imports
import psycopg2
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from fuzzywuzzy import process
import torch
import os
import re
from fuzzywuzzy import fuzz
from pydantic import Field
import shutil 
from typing import List, Dict, Any, Set
from sqlalchemy import create_engine 
import numpy as np 
import sys 

In [3]:
# LlamaIndex core components
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.schema import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding as LlamaIndexHuggingEmbedding 
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.retrievers import VectorIndexRetriever 

In [4]:
# LangChain components 
from langchain_community.llms import LlamaCpp 
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document as LangchainDocument
from langchain_core.retrievers import BaseRetriever as LangchainBaseRetriever 

In [5]:
# evaluation libraries
from evaluate import load 
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 
from bert_score import score as bert_score_calc 
from rouge_score.rouge_scorer import RougeScorer

In [6]:
# UI import
import gradio as gr 

In [7]:
# database configurations
DB_CONFIG = {
    "host": "localhost",
    "database": "postgres",
    "user": "postgres",
    "password": "postgres", 
    "port": "5432"
}

In [8]:
# file and model paths
BIOBERT_MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1" 
LLAMA3_MODEL_PATH = "Meta-Llama-3.1-8B-Instruct-Q6_K.gguf" 

SCRIPT_DIR = os.path.dirname(os.path.abspath("final_rag_code.ipynb"))
VECTOR_DB_PATH = os.path.join(SCRIPT_DIR, "faiss_index") 
PREPROCESSED_DATA_CSV = os.path.join(SCRIPT_DIR, "preprocessed_faers_data.csv") 


In [9]:
# user friendly term for standard categories fromed by previous experiments
category_mappings = {
    "product/administration issues": "Product/Administration Issues", "dermatological/allergic": "Dermatological/Allergic",
    "neurological": "Neurological", "gastrointestinal": "Gastrointestinal", "psychiatric": "Psychiatric",
    "cardiovascular": "Cardiovascular", "respiratory": "Respiratory", "general/systemic disorders": "General/Systemic Disorders",
    "musculoskeletal": "Musculoskeletal", "other organ systems/conditions": "Other Organ Systems/Conditions",
    "dermatological": "Dermatological/Allergic", "allergic": "Dermatological/Allergic", "allergy": "Dermatological/Allergic",
    "skin": "Dermatological/Allergic", "anaphylactic": "Dermatological/Allergic", "immune": "Dermatological/Allergic",
    "nervous system": "Neurological", "brain": "Neurological", "head": "Neurological", "cognitive": "Neurological",
    "mental health": "Psychiatric", "mental": "Psychiatric", "psychological": "Psychiatric", "behavioral": "Psychiatric",
    "stomach": "Gastrointestinal", "gut": "Gastrointestinal", "digestive": "Gastrointestinal",
    "heart": "Cardiovascular", "circulatory": "Cardiovascular",
    "lung": "Respiratory", "breathing": "Respiratory",
    "general symptoms": "General/Systemic Disorders", "systemic": "General/Systemic Disorders", "overall health": "General/Systemic Disorders",
    "pain": "General/Systemic Disorders", "functional impairment": "General/Systemic Disorders",
    "bone": "Musculoskeletal", "joint": "Musculoskeletal", "muscle": "Musculoskeletal", "muscular": "Musculoskeletal",
    "product issues": "Product/Administration Issues", "administration issues": "Product/Administration Issues",
    "drug administration": "Product/Administration Issues", "medication error": "Product/Administration Issues",
    "drug use": "Product/Administration Issues",
    "metabolic": "Other Organ Systems/Conditions", "metabolism": "Other Organ Systems/Conditions",
    "metabolic issues": "Other Organ Systems/Conditions", "infection": "Other Organ Systems/Conditions",
    "infectious": "Other Organ Systems/Conditions", "blood": "Other Organ Systems/Conditions",
    "liver": "Other Organ Systems/Conditions", "kidney": "Other Organ Systems/Conditions", "urinary": "Other Organ Systems/Conditions",
    "eye": "Other Organ Systems/Conditions", "vision": "Other Organ Systems/Conditions", "pregnancy": "Other Organ Systems/Conditions",
    "fetal": "Other Organ Systems/Conditions", "injury": "Other Organ Systems/Conditions", "investigations": "Other Organ Systems/Conditions",
    "social": "Other Organ Systems/Conditions", "ear": "Other Organ Systems/Conditions", "throat": "Other Organ Systems/Conditions",
}

## Fetching and Preprocessing Data

In [10]:
# data extraction method
def extract_data_from_postgres(db_config, table_name='merged_faers_data_new'):
    """extracts data from PostgreSQL and returns a pandas DataFrame."""
    conn = None
    try:
        conn = psycopg2.connect(**db_config)
        cursor = conn.cursor()
        query = f"""
        SELECT primaryid, pt, drugname, prod_ai, age, sex, reaction_category
            FROM {table_name}
            WHERE pt IS NOT NULL AND prod_ai IS NOT NULL
        """
        df = pd.read_sql(query, conn)
        return df
    except Exception as e:
        print(f"Error extracting data from {table_name}: {e}")
        return None
    finally:
        if conn:
            conn.close()

In [11]:
def preprocess_data(df):
    """
    preprocessings for RAG model preparation
    """
    print("Starting data preprocessing including categorization...")

    # cleanups
    df['reaction_category'] = df['reaction_category'].replace('[null]', 'Unknown Category').str.strip().str.title()
    df.fillna({
        'age': 'unknown',
        'sex': 'unknown',
        'drugname': 'unknown',
        'prod_ai': 'unknown',
        'pt': 'unknown reaction',
        'primaryid': 'unknown_id' 
    }, inplace=True)
    


    # handeling the age group logic
    df['age_group'] = "unknown age group" 
    for index, row in df.iterrows():
        if pd.notna(row['age']) and row['age'] != 'unknown':
            try:
                age_val = float(row['age'])
                if 18 <= age_val <= 64:
                    df.at[index, 'age_group'] = "adult"
                elif age_val >= 65:
                    df.at[index, 'age_group'] = "elderly"
                elif age_val < 18:
                    if age_val <= 2:
                        df.at[index, 'age_group'] = "infant"
                    elif age_val <= 11:
                        df.at[index, 'age_group'] = "child"
                    else: # 12 to 17
                        df.at[index, 'age_group'] = "adolescent"
            except ValueError:
                pass

    df['age_group'] = df['age_group'].astype(str) 

    # filling NA for critical columns
    df['prod_ai'] = df['prod_ai'].fillna('Unknown Drug')
    df['drugname'] = df['drugname'].fillna('Unknown Drug Name')
    df['sex'] = df['sex'].fillna('Unknown')
    df['pt'] = df['pt'].fillna('Unknown Reaction')
    df['reaction_category'] = df['reaction_category'].fillna('Other Organ Systems/Conditions') 
    df['age_group'] = df['age_group'].fillna('Unknown') 

    # LlamaIndex document objects
    processed_documents = [] 
    required_cols = ['primaryid', 'drugname', 'prod_ai', 'pt', 'reaction_category', 'age', 'sex', 'age_group']
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column in DataFrame for LlamaIndex Document creation: {col}")
    # getting required row values for metadata
    for _, row in df.iterrows():
        doc_text = (
            f"Adverse Drug Reaction Report:\n"
            f"Product Active Ingredient: {row['prod_ai']}. "
            f"Drug Name: {row['drugname']}. "
            f"Patient Age: {row['age']} years ({row['age_group']}). "
            f"Patient Sex: {row['sex']}. "
            f"Reported Reaction: {row['pt']}. "
            f"Reaction System Category: {row['reaction_category']}."
        )
        
        metadata = {
            'primaryid': row['primaryid'],
            'drugname': row['drugname'],
            'prod_ai': row['prod_ai'],
            'pt': row['pt'],
            'reaction_category': row['reaction_category'], 
            'age': row['age'],
            'sex': row['sex'],
            'age_group': row['age_group']
        }
        processed_documents.append(Document(text=doc_text, metadata=metadata))

    return processed_documents, df 

In [12]:
# additional drugname normalization
def normalize_drug_name(drug_name):
    """normalizes drug names by removing common suffixes and splitting combinations."""
    normalized_names = []
    components = drug_name.split('\\')
    for component in components:
        component = component.strip()
        component = re.sub(r' HYDROCHLORIDE', '', component, flags=re.IGNORECASE)
        component = re.sub(r' DIHYDROCHLORIDE', '', component, flags=re.IGNORECASE)
        component = re.sub(r' SULFATE', '', component, flags=re.IGNORECASE)
        component = re.sub(r' SODIUM', '', component, flags=re.IGNORECASE)
        normalized_names.append(component.strip())
    return " ".join(normalized_names).strip()

### Drug Name Normalization via Fuzzy Matching

This function attempts to resolve raw drug names from user queries by first checking for exact matches in the normalized `prod_ai` list, followed by fuzzy matching using `fuzz.token_set_ratio`. This was essential for improving query robustness in the RAG model


In [13]:
def fuzzy_match_drug(query_drug_name_raw, available_prod_ai_for_fuzzy, normalized_prod_ai_map, threshold=90):
    lower_query_drug = query_drug_name_raw.lower()
    
    # checking for the if there is an exact match with the normalized drugname list
    if lower_query_drug in [name.lower() for name in available_prod_ai_for_fuzzy]:
        print(f"DEBUG: Exact match of raw query '{query_drug_name_raw}' to normalized '{lower_query_drug}'.")
        original_ais = normalized_prod_ai_map.get(lower_query_drug.upper(), []) 
        if original_ais:
            # original ais associated with normalized key
            return original_ais
        return [] 
    
    # performing fuzzy matching of the drugname
    best_match_tuple = None
    if available_prod_ai_for_fuzzy:
        best_match_tuple = process.extractOne(
            lower_query_drug, 
            available_prod_ai_for_fuzzy, 
            scorer=fuzz.token_set_ratio, 
            score_cutoff=threshold 
        )
    # original active ingredient lookup for matched key
    if best_match_tuple:
        normalized_matched_drug_key = best_match_tuple[0]
        score = best_match_tuple[1]
        
        print(f"DEBUG: fuzzy matched '{query_drug_name_raw}' (score: {score}) to normalized '{normalized_matched_drug_key}'.")
        
        original_ais = normalized_prod_ai_map.get(normalized_matched_drug_key.upper(), [])
        if original_ais:
            return original_ais 
    
    print(f"DEBUG: no fuzzy match found for '{query_drug_name_raw}' with threshold {threshold}.")
    return [] 

In [14]:
# simple cleanup to normalize the set of adverse reaction terms
def normalize_pt_set(pt_set):
    return sorted([re.sub(r'\s+', ' ', pt.strip().lower()) for pt in pt_set])

### Query Parsing for Metadata-Based Filtering

This function extracts structured filters (e.g., sex, age group, drug name, reaction category) from natural language queriesenabling metadata-aware document retrieval within the RAG system. parsing logic includes regex-based pattern matching and fuzzy drug name resolution.


In [15]:
# parsing the query for filters
def parse_query_for_filters(user_query, available_prod_ai_for_fuzzy, normalized_prod_ai_map):
    filters = {}
    lower_query = user_query.lower()
    print(f"DEBUG: received query: '{user_query}'")
    print(f"DEBUG: lower_query: '{lower_query}'")

    # sex extraction
    if re.search(r'\bfemale\b|\bwomen\b|\bfemales\b|\bwomens\b', lower_query):
        filters["sex"] = "F"
    elif re.search(r'\bmale\b|\bmen\b|\bmales\b|\bmens\b', lower_query):
        filters["sex"] = "M"

    # age group extraction 
    age_extracted = False
    
    # for conditions when the query specifies over x years old
    over_age_match = re.search(r'(?:over|older than)\s*(\d+)\s*years?\s*old', lower_query)
    if over_age_match:
        age_limit = int(over_age_match.group(1))
        if age_limit >= 60: 
            filters["age_group"] = "elderly"
        elif age_limit >= 18:
            filters["age_group"] = "adult"
        age_extracted = True

    #  for conditions when the query specifies under x years old

    if not age_extracted:
        under_age_match = re.search(r'(?:under|less than)\s*(\d+)\s*years?\s*old', lower_query)
        if under_age_match:
            age_limit_upper_exclusive = int(under_age_match.group(1))
            if age_limit_upper_exclusive <= 3: 
                filters["age_group"] = "infant"
            elif age_limit_upper_exclusive <= 12: 
                filters["age_group"] = "child"
            elif age_limit_upper_exclusive <= 18: 
                filters["age_group"] = "adolescent"
            elif age_limit_upper_exclusive <= 65: 
                filters["age_group"] = "adult"
            else: 
                filters["age_group"] = "elderly" 
            age_extracted = True
    # if specific age is given
    if not age_extracted:
        exact_age_match = re.search(r'\b(\d+)\s*years?\s*old\b', lower_query)
        if exact_age_match:
            age_val = int(exact_age_match.group(1))
            if age_val <= 2: filters["age_group"] = "infant"
            elif age_val <= 11: filters["age_group"] = "child"
            elif age_val <= 17: filters["age_group"] = "adolescent"
            elif age_val <= 64: filters["age_group"] = "adult"
            else: filters["age_group"] = "elderly"
            age_extracted = True
    # fallback
    if not age_extracted:
        if "infant" in lower_query:
            filters["age_group"] = "infant"
        elif "child" in lower_query or "children" in lower_query or "pediatric" in lower_query:
            filters["age_group"] = "child"
        elif "adolescent" in lower_query or "teen" in lower_query:
            filters["age_group"] = "adolescent"
        elif "adult" in lower_query:
            filters["age_group"] = "adult"
        elif "elderly" in lower_query or "senior" in lower_query or "over 65" in lower_query:
            filters["age_group"] = "elderly"
    # drug name extractoin
    matched_drug_ais = set()
    
    # extracting the drugnames within "" (initially tried the RAG with drugname inside quotation so this step is here)
    quoted_drug_match = re.search(r"'(?P<drug_name>[a-zA-Z0-9\s-]+?)'", lower_query)
    if quoted_drug_match:
        raw_drug_name = quoted_drug_match.group('drug_name').strip()
        # fuzzy_match_drug to return a list of original active ingredients
        matched_ais_for_component = fuzzy_match_drug(raw_drug_name, available_prod_ai_for_fuzzy, normalized_prod_ai_map, threshold=90)
        if matched_ais_for_component:
            matched_drug_ais.update(matched_ais_for_component)
            print(f"DEBUG: Extracted quoted drug '{raw_drug_name}' and matched to: {matched_ais_for_component}")
        else:
            print(f"DEBUG: No fuzzy match for quoted drug: '{raw_drug_name}'")

    # keyword based extraction for drugs
    if not matched_drug_ais:
        # specific regex to get the exact drug name
        keyword_drug_match = re.search(r'(?:used|taking|with|for|drug(?: name)?|to)\s+([a-zA-Z0-9\s-]+?)(?:\s+(?:in|for|what|common|adverse|reactions|side effects|\?|$|\'|\.|,|$))', lower_query)
        if keyword_drug_match:
            raw_drug_name = keyword_drug_match.group(1).strip()
            # initially other words were being filtered as drugs, so curated this list based on the seen responses
            common_query_words_strict = ["show", "report", "list", "drug", "reactions", "adverse", "common", "events", "side", "effects", "what", "to", "a", "an", "the", "of", "patient", "group", "any"]
            if raw_drug_name.lower() not in common_query_words_strict:
                matched_ais_for_component = fuzzy_match_drug(raw_drug_name, available_prod_ai_for_fuzzy, normalized_prod_ai_map, threshold=90)
                if matched_ais_for_component:
                    matched_drug_ais.update(matched_ais_for_component)
                    print(f"DEBUG: Extracted keyword-based drug '{raw_drug_name}' and matched to: {matched_ais_for_component}")
                else:
                    print(f"DEBUG: No fuzzy match for keyword-based drug: '{raw_drug_name}'")
            else:
                print(f"DEBUG: Discarding keyword extracted word '{raw_drug_name}' as it is a common query word.")

    # checking for any known drug
    if not matched_drug_ais:
        # matching longer names first
        sorted_prod_ais = sorted(available_prod_ai_for_fuzzy, key=len, reverse=True)
        for prod_ai_norm in sorted_prod_ais:
            # for whole word match
            if re.search(r'\b' + re.escape(prod_ai_norm.lower()) + r'\b', lower_query):
                matched_ais = normalized_prod_ai_map.get(prod_ai_norm.upper(), [])
                if matched_ais:
                    matched_drug_ais.update(matched_ais)
                    print(f"DEBUG: Direct whole-word match for '{prod_ai_norm}' found and matched to: {matched_ais}")
                    break 

    if matched_drug_ais:
        filters['prod_ai'] = list(matched_drug_ais) 
    else:
        print(f"DEBUG: no prod_ai matched for query: '{user_query}'")

 
    found_category = None
    # iterating throught the category mapping for category search
    for phrase, category_name in category_mappings.items():
        if re.search(r'\b' + re.escape(phrase) + r'\b', lower_query):
            filters["reaction_category"] = category_name
            found_category = category_name
            break

    print(f"DEBUG: returning filters: {filters}")
    return filters

### Embedding Creation and FAISS Index Management

initializing BioBERT-based embeddings via LlamaIndex and managing FAISS indexing. It supports persistent storage for faster reloads and rebuilds the index only when necessary.

In [16]:
def create_embeddings_and_index(llama_index_documents, model_name, faiss_index_dir, embedding_dimension=768):
    """
    creating Biobert embeddings and build/load the faiss index
    """
    print("Loading BioBERT model (for LlamaIndex embeddings)...")
    # using GPU for better execution
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device for embeddings: {device}")
    # embedding model
    embed_model = LlamaIndexHuggingEmbedding(model_name=model_name, device=device)
    # file paths
    faiss_file = os.path.join(faiss_index_dir, "faiss.index")
    docstore_dir = os.path.join(faiss_index_dir, "docstore")
    index_store_dir = os.path.join(faiss_index_dir, "index_store")

    # trying to load the existing faiss index if it exitsts
    if os.path.exists(faiss_file) and os.path.exists(docstore_dir) and os.path.exists(index_store_dir):
        print(f"Loading FAISS index from {faiss_index_dir}...")
        try:
            faiss_index = faiss.read_index(faiss_file)
            vector_store = FaissVectorStore(faiss_index=faiss_index)
            # restoring storage context
            storage_context = StorageContext.from_defaults(
                vector_store=vector_store,
                docstore=SimpleDocumentStore.from_persist_dir(docstore_dir),
                index_store=SimpleIndexStore.from_persist_dir(index_store_dir),
            )
            # creating vector index from stored vectors
            index = VectorStoreIndex.from_documents(
                [], 
                storage_context=storage_context,
                embed_model=embed_model, 
                show_progress=False 
            )
            print("FAISS index loaded successfully.")
            return index, embed_model
        except Exception as e:
            print(f"Error loading FAISS index: {e}. Rebuilding index.")
            shutil.rmtree(faiss_index_dir, ignore_errors=True) 
            pass
    
    # case when new FAISS index is created
    print(f"FAISS index not found at {faiss_index_dir} or failed to load. Building new index.")
    # flat L2 indexing
    faiss_index = faiss.IndexFlatL2(embedding_dimension)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # new vector store
    index = VectorStoreIndex.from_documents(
        llama_index_documents,
        storage_context=storage_context,
        embed_model=embed_model, 
        show_progress=True
    )
    print(f"FAISS index built. Saving to {faiss_index_dir}...")
    
    index.storage_context.persist(persist_dir=faiss_index_dir)
    print(f"FAISS index saved to {faiss_index_dir}")

    return index, embed_model


### Custom Retriever with Dynamic Token Budgeting and Metadata Filtering

wraps a LlamaIndex semantic retriever with two key enhancements:  
1. **Dynamic document selection** based on available context window tokens 
2. **Post-retrieval filtering** using metadata (e.g., `prod_ai`, `sex`, `age_group`)—which FAISS does not natively support.  
ensures only relevant documents are passed to the LLM within context limits while respecting user-specified filters.


In [17]:
class LlamaIndexFilteredRetriever(LangchainBaseRetriever):
    """
    for post retieval filtering as we have metadata in the dataframe 
    and that is not directly supported by faiss vector store
    """
    llama_index_index: Any = Field(...)
    max_context_tokens: int = Field(...) 
    estimated_prompt_tokens: int = Field(500) 
    max_response_tokens_llm: int = Field(500) 
    estimated_avg_doc_tokens: int = Field(100) # changed from 200 to 150 now changed to 100 based on experiments
    min_k: int = Field(3) 
    max_k: int = Field(40) #changed from 20 to 30 now changed to 40
    current_filters: Dict[str, Any] = Field(default_factory=dict) 

    def __init__(self, llama_index_index: Any, max_context_tokens: int, **kwargs: Any):
        super().__init__(
            llama_index_index=llama_index_index,
            max_context_tokens=max_context_tokens,
            **kwargs
        )

    def _get_relevant_documents(
        self, query: str, *, run_manager=None
        # returning a large batch of smeantically similar matches
    ) -> List[LangchainDocument]:
        available_tokens_for_docs = self.max_context_tokens - self.estimated_prompt_tokens - self.max_response_tokens_llm
        # dynamic value of the desired k based on available token budget
        if self.estimated_avg_doc_tokens <= 0 or available_tokens_for_docs <= 0:
            dynamic_desired_k = self.min_k 
        else:
            dynamic_desired_k = available_tokens_for_docs // self.estimated_avg_doc_tokens
        
        dynamic_desired_k = max(self.min_k, min(self.max_k, dynamic_desired_k))
        
        print(f"DEBUG: Calculated dynamic desired_k: {dynamic_desired_k} (based on {self.max_context_tokens} n_ctx, {self.estimated_prompt_tokens} prompt, {self.max_response_tokens_llm} response, {self.estimated_avg_doc_tokens} avg doc)")
        # defining the number of documents to be fetched in initial pass
        base_retriever = self.llama_index_index.as_retriever(
            similarity_top_k=2000 
        )
        retrieved_nodes_raw = base_retriever.retrieve(query)
        print(f"DEBUG: Initial semantic retrieval returned {len(retrieved_nodes_raw)} nodes.")

        filtered_nodes = []
        #applying metadata filters in the fetched documents
        if self.current_filters:
            print(f"DEBUG: Applying metadata filters: {self.current_filters}")
            for node_with_score in retrieved_nodes_raw:
                doc = node_with_score.node
                metadata = doc.metadata
                match = True
                #comparing each filter key and value with metadata node
                for key, value in self.current_filters.items():
                    metadata_value = str(metadata.get(key, '')).lower()
                    # matching for the drug active ingredient filter
                    if key == 'prod_ai':
                        if not any(ai.lower() == metadata_value for ai in value):
                            match = False
                            break
                    else:
                        filter_value = str(value).lower()
                        if metadata_value != filter_value:
                            match = False
                            break
                    
                if match:
                    filtered_nodes.append(node_with_score)
            print(f"DEBUG: After filtering, {len(filtered_nodes)} nodes remain.")
        else:
            filtered_nodes = retrieved_nodes_raw
        # limiting the code to desired value
        final_retrieved_nodes = filtered_nodes[:dynamic_desired_k]
        print(f"DEBUG: Selected {len(final_retrieved_nodes)} documents (out of {len(filtered_nodes)} filtered) for LLM context.")
        # converting llamaindex nodes to langchain documents
        langchain_docs = []
        for node_with_score in final_retrieved_nodes:
            langchain_docs.append(LangchainDocument(
                page_content=node_with_score.node.text,
                metadata=node_with_score.node.metadata
            ))
        return langchain_docs

### RAG System Initialization and Prompt Configuration

defines the global Retrieval-Augmented Generation (RAG) components, including:

- **strict extraction prompt** (`QA_CHAIN_PROMPT_CLEAN`)to enforce reaction reporting from FAERS documents without hallucination based on the multiple experiments and viewing of the results.


In [18]:
# LLM and prompt setup
llm_instance = None
_qa_chain_global = None
_dynamic_filtered_retriever_global = None
_available_prod_ai_for_fuzzy_global = None
_normalized_prod_ai_map_global = None

# Prompt for the LLM after multiple iterations
QA_CHAIN_PROMPT_CLEAN = PromptTemplate.from_template(
"""Context:
{context}
Question: {question}
Extract and list ALL unique reported reactions *STRICTLY AND ONLY* from the "Reported Reaction" field within the provided Context.
DO NOT generate any reactions that are not explicitly present in the 'Reported Reaction' field of the Context documents.
DO NOT include any other information, such as drug names, patient demographics (age, sex), or reaction categories.
Each reaction must be on a NEW, SEPARATE LINE, prefixed with a hyphen and a single space (e.g., "- reaction name").
DO NOT provide any introductory phrases, concluding remarks, explanations, questions, or code blocks.
If no relevant reactions are found in the Context, respond ONLY with "No reactions found."

Reported Reactions:
""")

# global RAG components
_rag_initialized = False
_data_df_global = None  
_llama_index_index_global = None  
_llama_index_embed_model_global = None 
_available_prod_ai_for_fuzzy_global = None  
_normalized_prod_ai_map_global = None  
_dynamic_filtered_retriever_global = None  
_qa_chain_global = None  

# used for non-gradio environment i.e when running in console
class DummyProgress:
    def update(self, *args, **kwargs):
        pass

# method to initialize the RAG components
def _initialize_rag_components():
    """Initializes all heavy RAG components (data, index, LLM chain)."""
    global _rag_initialized, _data_df_global, _llama_index_index_global, \
           _available_prod_ai_for_fuzzy_global, _normalized_prod_ai_map_global, \
           _dynamic_filtered_retriever_global, _qa_chain_global, llm_instance, \
           _llama_index_embed_model_global 
    # if already initialized i.e running it the second time
    if _rag_initialized:
        print("RAG components already initialized.")
        return

    print("Initializing RAG components for the first time...")
    
    data_df_raw = None
    # loading the dataset if it exists already in the file
    if os.path.exists(PREPROCESSED_DATA_CSV):
        print(f"Loading preprocessed data from {PREPROCESSED_DATA_CSV}...")
        try:
            data_df_raw = pd.read_csv(PREPROCESSED_DATA_CSV)
            print("Preprocessed data loaded successfully from CSV.")
        except Exception as e:
            print(f"Error loading preprocessed data from CSV: {e}. Falling back to PostgreSQL extraction and reprocessing.")
            data_df_raw = None 
    # if the data does not exist already in a csv format, load it directly from the database view
    if data_df_raw is None: 
        data_df_raw = extract_data_from_postgres(DB_CONFIG, table_name='merged_faers_data_new')
        if data_df_raw is None:
            raise Exception("Failed to extract raw data from PostgreSQL.")
        #applying pre-processing to the data because it is loaded from database
        processed_documents, _data_df_global = preprocess_data(data_df_raw.copy())
        
        _data_df_global.to_csv(PREPROCESSED_DATA_CSV, index=False)
        print(f"Preprocessed data and saved to {PREPROCESSED_DATA_CSV}.")
    else:
        processed_documents, _data_df_global = preprocess_data(data_df_raw.copy())

    # fallback case for when the db connection is invalid and file is not found
    if processed_documents is None or _data_df_global is None:
        raise Exception("Failed to prepare data for RAG components.")
    # normalized drugname mapping for fuzzy search
    available_prod_ai = _data_df_global['prod_ai'].unique().tolist()
    _normalized_prod_ai_map_global = {}
    for original_ai in _data_df_global['prod_ai'].unique():
        components_to_normalize = original_ai.split('\\')
        for comp in components_to_normalize:
            normalized_base_name = normalize_drug_name(comp)
            if normalized_base_name:
                normalized_base_name = normalized_base_name.upper()
                if normalized_base_name not in _normalized_prod_ai_map_global:
                    _normalized_prod_ai_map_global[normalized_base_name] = []
                if original_ai not in _normalized_prod_ai_map_global[normalized_base_name]:
                    _normalized_prod_ai_map_global[normalized_base_name].append(original_ai)
    _available_prod_ai_for_fuzzy_global = list(_normalized_prod_ai_map_global.keys())
    # creating/loading embeddings and FAISS vectorIndex
    _llama_index_index_global, _llama_index_embed_model_global = create_embeddings_and_index(
        processed_documents, BIOBERT_MODEL_NAME, VECTOR_DB_PATH, embedding_dimension=768
    )
    if _llama_index_index_global is None:
        raise Exception("Failed to create/load LlamaIndex.")
    
    # initializing the llm and binding to the retrievalqa chain
    # the stop sequences were built by observing the output from the llm multiple time so that the final output came as desired
    # the temperature value was set based on experiments to stop llm hallucinating
    llm_instance = LlamaCpp(
        model_path=LLAMA3_MODEL_PATH,
        temperature=0.2,  
        max_tokens=500,  
        n_ctx=4096, 
        n_gpu_layers=-1, 
        verbose=False,
        stop=[
            "```", "\n```", "\n\n```", "```python", "```json", "```text", "```yaml", "```bash",
            "\n\nNote:", "\nNote:",  
            "\n\nAnswer:", "\nAnswer:",
            "\n\nExplanation:", "\nExplanation:",
            "\n\nResponse:", "\nResponse:",
            "\n\nBased on the provided Context,",
            "\n\nList all reported reactions",
            "List of Adverse Events for Males over 60 years old who took Fexofenadine:",  
            "No further information is required.",  
            "The list above already contains all the relevant reactions.",  
            "\"No reactions found.\" was removed from the output as it is not applicable to this problem.",
            "\"No reactions found.\" was removed from the output",  
            "not applicable to this problem.",
            "was removed from the output",
            "It should be removed from the list.",
            "The list is now correct.",
            "There are ",  
            "\n\nIf no relevant reactions are found",
            "\n\nTherefore, the Answer is correct.",
            "\n\nThis indicates that the system has correctly identified",
            "\n\nFinally, the system has correctly prefixed",
            "\n- dermatological/Allergic.",  
            "\n- respiratory.",  
            "\n- neurological.",
            "\n\nI hope this helps.",
            "\n\nLet me know if you have any other questions.",
            "\n\nIf there are no reactions found,",
            "\n\nHere are the reactions:",
            "\n\nHere is a list of reactions:",
            "\n\nHere's the information:",
            "adverse reaction report",
            "\n\n\n\n",
            "(note:",          
            "corrected answer:", 
            "answer:",         
            "Reported Reactions:",
            "Based on the context, the reported reactions are:",
            "The reported reactions are:"
        ],
    )
    # checking for GPU availability and using it if available else fallback to CPU
    if torch.cuda.is_available():
        print(f"CUDA is available. LlamaCpp use GPU with n_gpu_layers={llm_instance.n_gpu_layers}.")
        print(f"Current CUDA device: {torch.cuda.get_device_name(0)}")
    else:
        print("CUDA is NOT available.")

    _dynamic_filtered_retriever_global = LlamaIndexFilteredRetriever(
        llama_index_index=_llama_index_index_global,
        max_context_tokens=llm_instance.n_ctx, 
        estimated_prompt_tokens=500, 
        max_response_tokens_llm=llm_instance.max_tokens, 
        estimated_avg_doc_tokens=100, #changed from 200 to 150, now changed to 100
        min_k=3, 
        max_k=40 #changed from 20 to 30 now changed to 40
    )
    # qa chain definition
    _qa_chain_global = RetrievalQA.from_chain_type(
        llm_instance,  
        retriever=_dynamic_filtered_retriever_global,
        return_source_documents=True,
        chain_type="stuff",
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT_CLEAN, "document_variable_name": "context"}  
    )
    
    _rag_initialized = True
    print("RAG components initialized successfully.")

### Reaction Extraction and Fuzzy Filtering Logic

post-processing the raw LLM output to isolate **true adverse reaction terms (PTs)**. It does the following:

- **Line-based Parsing**: It parses line-by-line outputs prefixed with `"- "` .
- **Keyword Filtering**: Filters out common non-reaction phrases and high-level reaction categories using a curated set of domain-specific stopwords and MedDRA-based mappings.
- **Fuzzy Matching**: Uses `fuzz.ratio` (from `fuzzywuzzy`) to match noisy or slightly miswritten LLM outputs to known `PT` terms.  
  - **Threshold tuning**: The similarity threshold was experimentally lowered from **80 → 75 → 70** to improve **recall** in PT extraction without introducing significant noise.

In [19]:
# function for extracting and filtering reactions (used by run_rag_query and for direct console output)
def _extract_and_filter_reactions(llm_output: str, allowed_pts_from_retrieval: Set[str] = None) -> set:
    cleaned_reactions = set()
    lines = llm_output.split('\n')
    # common non-reaction metadata terms to filter out (case-insensitive) based on multiple answer investigation from the RAG model
    non_reaction_keywords = {
        "patient age", "patient sex", "drug name", "product active ingredient",
        "reaction system category", "unknown patient age", "unknown patient sex",
        "unknown reaction system category", "prescription drug used without a prescription",
        "cetirizine 10 mg tablets", "cetirizine hydrochloride", 
        "adult patient", "elderly patient", "female patient", "male patient",
        "product/administration issues", "dermatological/allergic", "neurological",
        "gastrointestinal", "psychiatric", "cardiovascular", "respiratory",
        "general/systemic disorders", "musculoskeletal", "other organ systems/conditions","adverse reaction report",
        "no reactions found", 
    }
    for cat_phrase in category_mappings.keys():
        non_reaction_keywords.add(cat_phrase.lower())
    for cat_value in category_mappings.values():
        non_reaction_keywords.add(cat_value.lower())
    # grabbing the known preferred terms from the global dataframe for normalizatoin
    global _data_df_global
    known_pts_overall = set(_data_df_global['pt'].str.lower().unique()) if _data_df_global is not None else set()
    # looping through the LLM output lines and only looking at those that start with "-"
    for line in lines:
        line = line.strip()
        if line.startswith('- ') and len(line) > 2:
            potential_reaction = line[2:].strip().lower()

            is_non_reaction = False
            # skipping if non reaction keyword
            for keyword in non_reaction_keywords:
                if keyword in potential_reaction:
                    is_non_reaction = True
                    break
            
            if not is_non_reaction:
                potential_reaction = re.sub(r'\s*\|\s*$', '', potential_reaction).strip() 
                potential_reaction = re.sub(r'\s+', ' ', potential_reaction).strip() 

                if not potential_reaction:
                    continue 

                target_pts_for_matching = allowed_pts_from_retrieval if allowed_pts_from_retrieval is not None else known_pts_overall
                # threshold setting for confidence
                if target_pts_for_matching:
                    best_match, score = process.extractOne(potential_reaction, target_pts_for_matching, scorer=fuzz.ratio)
                    if score >= 70: # changed from 80 to 75 now changed to 70
                        cleaned_reactions.add(best_match)
    return cleaned_reactions

### RAG Inference Pipeline

This method represents the end-to-end RAG inference pipeline, taking a natural language query and returning:
1. A **cleaned, filtered list of reactions** (PTs),
2. The **source document metadata** used for generation,
3. The **original PTs retrieved** from the filtered FAERS dataset.

In [20]:
# method to run the RAG model
def run_rag_query(query: str, progress: Any = None) -> (str, str, Set[str]):
    """
    processes a user query through the RAG model and returns
    the extracted reactions and source metadata. 
    """
    # for first time run
    if progress and hasattr(progress, 'update'):
        progress.update(0, desc="Initializing RAG components (if first run)...")
    
    _initialize_rag_components() 
    # default fallbacks
    model_response = "Error: Something went wrong."
    sources_used_text = "No sources available."
    allowed_pts_from_retrieval = set() 

    try:
        # parsing filter from the query
        if progress and hasattr(progress, 'update'):
            progress.update(0.2, desc="Parsing query and extracting filters.")
        # extracting the search filters            
        retriever_filters = parse_query_for_filters(query, _available_prod_ai_for_fuzzy_global, _normalized_prod_ai_map_global)
        # if valid drug is not found, setting up a custom response        
        if 'prod_ai' not in retriever_filters or not retriever_filters['prod_ai']:
            model_response = "No reactions found for this drug. The drug name was not recognized or is not in the database."
            sources_used_text = "No source documents were retrieved as the drug was not recognized."
            return model_response, sources_used_text, allowed_pts_from_retrieval 
        # document retrieval
        if progress and hasattr(progress, 'update'):
            progress.update(0.4, desc="Retrieving relevant documents.")
        _dynamic_filtered_retriever_global.current_filters = retriever_filters
        initial_retrieved_docs_for_llm = _dynamic_filtered_retriever_global._get_relevant_documents(query)
        # condition if no matching document is found
        if not initial_retrieved_docs_for_llm:
            model_response = "No reactions found for the specified criteria."
            sources_used_text = "No source documents were returned as no relevant documents were found after filtering."
            return model_response, sources_used_text, allowed_pts_from_retrieval 
        # getting the value of adverse reaction preferred term from the documents
        for doc in initial_retrieved_docs_for_llm:
            if 'pt' in doc.metadata:
                allowed_pts_from_retrieval.add(str(doc.metadata['pt']).lower())
        print(f"DEBUG: Allowed PTs from retrieved documents: {allowed_pts_from_retrieval}")
        # generating the LLM response
        if progress and hasattr(progress, 'update'):
            progress.update(0.7, desc="Generating response with LLM.")
        result = _qa_chain_global({"query": query})

        if progress and hasattr(progress, 'update'):
            progress.update(0.9, desc="Post-processing LLM output...")
        raw_llm_result = result["result"]
        
        generated_reactions = _extract_and_filter_reactions(raw_llm_result, allowed_pts_from_retrieval)
        # generating a sorted list of reactions from the response
        final_response_list = sorted(list(generated_reactions))
        if final_response_list:
            model_response = "\n".join([f"- {r}" for r in final_response_list])
        else:
            model_response = "No reactions found for the specified criteria in the provided context."
        # getting the source results for transparency
        if "source_documents" in result and result["source_documents"]:
            sources_list = []
            for i, doc in enumerate(result["source_documents"]):
                sources_list.append(f"Document {i+1}:\n  Content snippet: {doc.page_content[:200]}...\n  Metadata: {doc.metadata}")
            sources_used_text = "\n\n".join(sources_list) 
        else:
            sources_used_text = "No source documents were returned."

    except Exception as e:
        model_response = f"An internal error occurred: {str(e)}"
        sources_used_text = "No sources available due to error."
        print(f"Error in run_rag_query: {e}")
    # complete
    if progress and hasattr(progress, 'update'):
        progress.update(1.0, desc="Done!")
    return model_response, sources_used_text, allowed_pts_from_retrieval 

### Generates the response for queries given in batches

Used during to initially evaluate the quality of responses, stop sequences and prompting strategies

In [21]:
# generating the response for test queries list or any queries
def generate_responses_for_queries(query_list: List[str]):
    """
    takes a list of queries and passes them one-by-one to the rag mofel for response
    used for the batch evaluation of the test cases
    """
    print("\n--- Generating Responses for Provided Queries ---")
    _initialize_rag_components() 

    for i, query in enumerate(query_list):
        print(f"\n--- Processing Query {i+1}/{len(query_list)}: '{query}' ---")
        
        model_response, sources_used_text, extracted_pts_set = run_rag_query(
            query, 
            progress=DummyProgress() 
        )

        print("\n--- Model Response")
        print(model_response)

        print("\n--- Sources Used (Metadata)")
        print(sources_used_text)

        print("\n--- Extracted PTs for expected_retrieved_pts")
        print(extracted_pts_set)
        print("-" * 80) 

    print("\n--- Finished Generating Responses ---")

# evaluation metrics initialization

bertscore = load("bertscore")
rouge = load("rouge")
bleu = load("bleu")

### Evaluates the metric score for test cases

In [22]:
# method for RAG model evaluation
def evaluate_rag_model(test_cases: List[Dict[str, Any]]):
    """
    calculates the evaluation metrics for the RAG queries.
    """
    print("\n--- Starting RAG Model Evaluation")
    # rag component initialization
    _initialize_rag_components()
    print("RAG components initialized for evaluation.")
    # variables to store the metrics as average value of the metric is calculated later on
    all_retrieval_precisions = []
    all_retrieval_recalls = []
    all_hit_rates = []
    all_bert_f1_scores = []
    all_rouge_l_f1_scores = [] 
    all_bleu_scores = []
    all_jaccard_scores = []
    # looping through the test cases
    for i, test_case in enumerate(test_cases):
        print(f"\n--- Running Test Case {i+1}/{len(test_cases)} ---")
        query = test_case["query"]
        ground_truth_reactions = test_case["expected_pts"]
        expected_retrieved_pts = test_case["expected_pts"]

        print(f"Query: {query}")
        print(f"Ground Truth Reactions: {ground_truth_reactions}")
        print(f"Expected Retrieved PTs (for retrieval eval): {expected_retrieved_pts}")

        
        # original_parse_query_for_filters = globals().get('parse_query_for_filters')
        # if "filters" in test_case:
        #     def _temp_parse_query_for_filters(q, available_prod_ai, normalized_prod_ai_map):
        #         print(f"DEBUG: Using test case filters: {test_case['filters']}")
        #         return test_case["filters"]
        #     globals()['parse_query_for_filters'] = _temp_parse_query_for_filters

        # model_response, sources_used_text, actual_retrieved_pts_from_run = run_rag_query(query, progress=DummyProgress())
        # globals()['parse_query_for_filters'] = original_parse_query_for_filters

        # print(f"\nModel Response:\n{model_response}")
        # print(f"\nSources Used:\n{sources_used_text}")

        # print(f"Actual Retrieved PTs (from RAG sources): {actual_retrieved_pts_from_run}")

        # generated_reactions = _extract_and_filter_reactions(model_response, actual_retrieved_pts_from_run)
        # print(f"Generated Reactions (from LLM output, post-filtered): {generated_reactions}")
        
        # getting the model responses, sources and the actual retrieved reactions
        model_response, sources_used_text, actual_retrieved_pts_from_run = run_rag_query(query, progress=DummyProgress())

        print(f"\nModel Response:\n{model_response}")
        print(f"\nSources Used:\n{sources_used_text}")

        print(f"Actual Retrieved PTs (from RAG sources): {actual_retrieved_pts_from_run}")
        # generated reactoins
        generated_reactions = _extract_and_filter_reactions(model_response, actual_retrieved_pts_from_run)
        print(f"Generated Reactions (from LLM output, post-filtered): {generated_reactions}")
        # retrieval metrics evaluation

        # retrieval precision calculation
        if len(actual_retrieved_pts_from_run) > 0:
            relevant_retrieved_count = len(expected_retrieved_pts.intersection(actual_retrieved_pts_from_run))
            retrieval_precision = relevant_retrieved_count / len(actual_retrieved_pts_from_run)
            all_retrieval_precisions.append(retrieval_precision)
        else:
            retrieval_precision = 0.0 
            all_retrieval_precisions.append(retrieval_precision)
        print(f"Retrieval Precision: {retrieval_precision:.2f}")

        # retrieval recall evaluation
        if len(expected_retrieved_pts) > 0:
            relevant_retrieved_count = len(expected_retrieved_pts.intersection(actual_retrieved_pts_from_run))
            retrieval_recall = relevant_retrieved_count / len(expected_retrieved_pts)
            all_retrieval_recalls.append(retrieval_recall)
        else:
            # if expected is empty, recall is 1.0 if nothing was retrieved, else 0.0 because wrongly retrieved something
            retrieval_recall = 1.0 if len(actual_retrieved_pts_from_run) == 0 else 0.0
            all_retrieval_recalls.append(retrieval_recall)
        print(f"Retrieval Recall: {retrieval_recall:.2f}")

        # hit rate is 1 if any expected retrieved PTs were actually retrieved, or if both are empty
        hit_rate = 1 if (len(expected_retrieved_pts.intersection(actual_retrieved_pts_from_run)) > 0) or \
                        (not expected_retrieved_pts and not actual_retrieved_pts_from_run) else 0
        all_hit_rates.append(hit_rate)
        print(f"Hit Rate: {hit_rate}")

        # normalizing the retreived adverse term sets
        predictions_list = normalize_pt_set(generated_reactions)
        references_list = normalize_pt_set(ground_truth_reactions)
        
        # generation metrics

        # if both prediction list and reference list is empty then assigning perfect score because it is the expected reaction
        if not predictions_list and not references_list:
            print("BLEU, BERTScore, and ROUGE-L skipped (both generated and ground truth reactions are empty).")
            all_bert_f1_scores.append(1.0)
            all_rouge_l_f1_scores.append(1.0)
            all_bleu_scores.append(1.0)
        # if only one of them is empty assigning 0
        elif not predictions_list or not references_list:
            print("Skipping BERTScore, ROUGE, BLEU for this test case due to one empty set of reactions (mismatch).")
            all_bert_f1_scores.append(0.0)
            all_rouge_l_f1_scores.append(0.0)
            all_bleu_scores.append(0.0)
        else:
            # BERTScore evaluaiton
            try:
                P, R, F1 = bert_score_calc(predictions_list, [references_list]*len(predictions_list), lang="en", device='cuda')
                bert_f1 = F1.mean().item()
                all_bert_f1_scores.append(bert_f1)
                print(f"BERTScore F1: {bert_f1:.2f}")
            except Exception as e:
                print(f"Error calculating BERTScore: {e}")
                all_bert_f1_scores.append(0.0)

            predictions_text_for_metrics = " ".join(predictions_list)
            references_text_for_metrics = " ".join(references_list)

            # BLEU evaluation
            try:
                # refernce sentence and candidate sentence needs to be tokenized
                # using sorted list as a single sentence
                tokenized_predictions = predictions_text_for_metrics.split()
                tokenized_references = [references_text_for_metrics.split()] # List of reference sentences

                # handeling the empty prediction set
                if not tokenized_predictions:
                    bleu_score = 0.0
                else:
                    # smooting function where no common n-grams is found
                    chencherry = SmoothingFunction()
                    bleu_score = sentence_bleu(tokenized_references, tokenized_predictions, smoothing_function=chencherry.method1)
                
                all_bleu_scores.append(bleu_score)
                print(f"BLEU Score: {bleu_score:.2f}")
            except Exception as e:
                print(f"Error calculating BLEU: {e}")
                all_bleu_scores.append(0.0)
            
            # ROUGE-L evaluation
            try:
                scorer = RougeScorer(['rougeL'], use_stemmer=True)
                rouge_scores = scorer.score(references_text_for_metrics, predictions_text_for_metrics)
                rouge_l_f1 = rouge_scores['rougeL'].fmeasure
                all_rouge_l_f1_scores.append(rouge_l_f1)
                print(f"ROUGE-L F1: {rouge_l_f1:.2f}")
            except Exception as e:
                print(f"Error calculating ROUGE-L: {e}")
                all_rouge_l_f1_scores.append(0.0)

            # Jaccard score evaluation
            try:
                jaccard_score = len(set(predictions_list).intersection(set(references_list))) / \
                                len(set(predictions_list).union(set(references_list)))
                all_jaccard_scores.append(jaccard_score)
                print(f"Jaccard Similarity: {jaccard_score:.2f}")
            except Exception as e:
                print(f"Error calculating Jaccard Similarity: {e}")
                all_jaccard_scores.append(0.0)
            
    # overall average score for the model
    print("\n--- Overall RAG Model Evaluation Results ")
    if all_retrieval_precisions:
        print(f"Average Retrieval Precision: {sum(all_retrieval_precisions) / len(all_retrieval_precisions):.2f}")
    if all_retrieval_recalls:
        print(f"Average Retrieval Recall: {sum(all_retrieval_recalls) / len(all_retrieval_recalls):.2f}")
    if all_hit_rates:
        print(f"Average Hit Rate: {sum(all_hit_rates) / len(all_hit_rates):.2f}")
    if all_bert_f1_scores:
        print(f"Average BERTScore F1: {sum(all_bert_f1_scores) / len(all_bert_f1_scores):.2f}")
    if all_rouge_l_f1_scores: # Changed to f1 for consistency
        print(f"Average ROUGE-L F1: {sum(all_rouge_l_f1_scores) / len(all_rouge_l_f1_scores):.2f}")
    if all_bleu_scores:
        print(f"Average BLEU Score: {sum(all_bleu_scores) / len(all_bleu_scores):.2f}")
    if all_jaccard_scores:
        print(f"Average Jaccard Score: {sum(all_jaccard_scores) / len(all_jaccard_scores):.2f}")

    return all_retrieval_precisions, all_retrieval_recalls, all_hit_rates, all_bert_f1_scores, all_rouge_l_f1_scores, all_bleu_scores

In [23]:
# main method for the RAG model
if __name__ == "__main__":
    print("\n--- Starting RAG Model Script ---")
    print("Initializing RAG components (this may take a few minutes on first run)...")
    try:
        _initialize_rag_components() 
    except Exception as e:
        print(f"FATAL ERROR during RAG initialization: {e}")
        sys.exit(1) 

    print("\n--- RAG System Ready")
    print("use 'evaluate_rag_model(test_cases)' for testing the data")
    print("'generate_responses_for_queries(query_list)' to get the answer for the query")


--- Starting RAG Model Script ---
Initializing RAG components (this may take a few minutes on first run)...
Initializing RAG components for the first time...
Loading preprocessed data from C:\Users\utsav\Desktop\research\updated\trimester2_updates\preprocessed_faers_data.csv...
Preprocessed data loaded successfully from CSV.
Starting data preprocessing including categorization...
Loading BioBERT model (for LlamaIndex embeddings)...
Using device for embeddings: cuda


No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.1. Creating a new one with mean pooling.


FAISS index not found at C:\Users\utsav\Desktop\research\updated\trimester2_updates\faiss_index or failed to load. Building new index.


Parsing nodes:   0%|          | 0/29722 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1050 [00:00<?, ?it/s]

FAISS index built. Saving to C:\Users\utsav\Desktop\research\updated\trimester2_updates\faiss_index...
FAISS index saved to C:\Users\utsav\Desktop\research\updated\trimester2_updates\faiss_index


llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


CUDA is available. LlamaCpp use GPU with n_gpu_layers=-1.
Current CUDA device: NVIDIA GeForce RTX 3050 6GB Laptop GPU
RAG components initialized successfully.

--- RAG System Ready
use 'evaluate_rag_model(test_cases)' for testing the data
'generate_responses_for_queries(query_list)' to get the answer for the query


### Curated test cases from database output

In [24]:
test_cases = [
    {
        "query": "What skin reactions are reported for adult females who used desloratadine?",
        "filters": {'sex': 'F', 'age_group': 'adult', 'prod_ai': ['DESLORATADINE'], 'reaction_category': 'Dermatological/Allergic'},
        "expected_pts": { 
            "pruritus",
            "rash",
            "erythema",
            "hyperhidrosis",
            "hypersensitivity",
            "skin disorder",
            "guttate psoriasis",
            "photosensitivity reaction",
            "rash erythematous",
            "skin dystrophy",
            "skin lesion",
            "dermatitis exfoliative generalised",
            "erythema nodosum",
            "generalised erythema",
            "pallor",
            "pruritus generalised",
            "rash maculo-papular",
            "skin plaque",
            "skin ulcer",
            "skin wound"
        }
    },
    {
        "query": "List all dermatological adverse events for males over 60 years old who took FEXOFENADINE.",
        "filters": {'sex': 'M', 'age_group': 'elderly', 'prod_ai': ['FEXOFENADINE'], 'reaction_category': 'Dermatological/Allergic'},
        "expected_pts": {
            "acute generalised exanthematous pustulosis",
            "eczema",
            "erythema",
            "hypersensitivity",
            "lip swelling",
            "pruritus",
            "rash pruritic",
            "skin disorder",
            "skin irritation",
            "skin mass"
        }

    },
    {
        "query": "Are there any gastrointestinal reactions reported for children using CETIRIZINE?",
        "filters": {'age_group': 'child', 'prod_ai': ['CETIRIZINE'], 'reaction_category': 'Gastrointestinal'},
        "expected_pts": {
            "abdominal discomfort",
            "abdominal pain",
            "abdominal pain upper",
            "coeliac disease",
            "diarrhoea",
            "gastrooesophageal reflux disease",
            "nausea",
            "vomiting"
            
        }
    },
    
    {
        "query": "List any adverse effects of Pseudoephedrine in infants.",
        "filters": {'age_group': 'infant', 'prod_ai': ['PSEUDOEPHEDRINE']},
        "expected_pts": set()
    },
    {
        "query": "What are the psychiatric adverse events for young adults using MONTELUKAST?",
        "filters": {'age_group': 'adolescent', 'prod_ai': ['MONTELUKAST'], 'reaction_category': 'Psychiatric'}, 
        "expected_pts": set()
    },
    {
        "query": "Report any adverse reactions to a fictional drug called 'ZYX-987' for any patient group.",
        "filters": {'prod_ai': ['ZYX-987']}, 
        "expected_pts": set()
    },
    {
        "query": "Show respiratory reactions to DESLORATADINE in males under 12 years old.",
        "filters": {'sex': 'M', 'age_group': 'child', 'prod_ai': ['DESLORATADINE'], 'reaction_category': 'Respiratory'},
        "expected_pts": set()
    },
    {
        "query": "What are the cardiovascular reactions of CETIRIZINE in elderly patients?",
        "filters": {'age_group': 'elderly', 'prod_ai': ['CETIRIZINE'], 'reaction_category': 'Cardiovascular'},
        "expected_pts": { "angina pectoris",
                    "arrhythmia",
                    "arrhythmia supraventricular",
                    "atrial fibrillation",
                    "atrial tachycardia",
                    "atrioventricular block",
                    "atrioventricular block complete",
                    "bradycardia",
                    "bradyphrenia",
                    "bradypnoea",
                    "cardiac arrest",
                    "cardiac disorder",
                    "cardiac failure",
                    "cardiac failure chronic",
                    "cardiospasm",
                    "electrocardiogram qt prolonged",
                    "heart rate increased",
                    "heart rate irregular",
                    "hypertension",
                    "myocardial infarction",
                    "palpitations",
                    "presyncope",
                    "syncope",
                    "tachycardia",
                    "ventricular tachycardia"
        }

    },
    {
        "query": "Show neurological reactions for adult males taking LORATADINE.",
        "filters": {'sex': 'M', 'age_group': 'adult', 'prod_ai': ['LORATADINE'], 'reaction_category': 'Neurological'},
        "expected_pts": { "ageusia",
                                    "anosmia",
                                    "asthenia",
                                    "dizziness",
                                    "dizziness postural",
                                    "dyskinesia",
                                    "headache",
                                    "impaired work ability",
                                    "insomnia",
                                    "muscle contractions involuntary",
                                    "muscle oedema",
                                    "muscle spasms",
                                    "muscular weakness",
                                    "musculoskeletal chest pain",
                                    "musculoskeletal discomfort",
                                    "myalgia",
                                    "paraesthesia",
                                    "paranoia",
                                    "parkinsonism",
                                    "rhabdomyolysis",
                                    "somnolence"
        }
    },
    {
        "query": "Are there any general systemic disorders reported for infants on DESLORATADINE?",
        "filters": {'age_group': 'infant', 'prod_ai': ['DESLORATADINE'], 'reaction_category': 'General/Systemic Disorders'},
        "expected_pts": set()
    },
    {
        "query": "What are the musculoskeletal adverse events for females using FEXOFENADINE?",
        "filters": {'sex': 'F', 'prod_ai': ['FEXOFENADINE'], 'reaction_category': 'Musculoskeletal'},
        "expected_pts": { "arthralgia",
                            "arthritis",
                            "back pain",
                            "costochondritis",
                            "joint swelling",
                            "pain in extremity"
        }


    },
    {
        "query": "Report product administration issues for any patient taking MONTELUKAST.",
        "filters": {'prod_ai': ['MONTELUKAST'], 'reaction_category': 'Product/Administration Issues'},
        "expected_pts": set()
    },
    
    {
        "query": "Are there any adverse effects for a drug called 'XYZ-123'?",
        "filters": {'prod_ai': ['XYZ-123']}, 
        "expected_pts": set()
    },
 
    {
        "query": "What are the respiratory issues for children using FEXOFENADINE?",
        "filters": {'age_group': 'child', 'prod_ai': ['FEXOFENADINE'], 'reaction_category': 'Respiratory'},
        "expected_pts": { "dyspnoea"}
	},
    {
        "query": "List allergic reactions to DESLORATADINE in elderly females.",
        "filters": {'sex': 'F', 'age_group': 'elderly', 'prod_ai': ['DESLORATADINE'], 'reaction_category': 'Dermatological/Allergic'},
        "expected_pts": { "eczema",
                "hyperhidrosis",
                "hypersensitivity",
                "pallor",
                "pruritus"
        }

    },
    {
        "query": "Are there any general disorders for adult males taking MONTELUKAST?",
        "filters": {'sex': 'M', 'age_group': 'adult', 'prod_ai': ['MONTELUKAST'], 'reaction_category': 'General/Systemic Disorders'},
        "expected_pts": set()
    },
    {
        "query": "What are the gastrointestinal effects of cetirizine in infants?",
        "filters": {'age_group': 'infant', 'prod_ai': ['CETIRIZINE'], 'reaction_category': 'Gastrointestinal'},
        "expected_pts": { "abdominal discomfort",
                    "abdominal distension",
                    "abdominal pain",
                    "abnormal faeces"
        }

    },
    {
        "query": "Report any psychiatric issues with LORATADINE in children.",
        "filters": {'age_group': 'child', 'prod_ai': ['LORATADINE'], 'reaction_category': 'Psychiatric'},
        "expected_pts": { "abnormal behaviour"
                    ,"aggression"
                    ,"agitation"
                    ,"altered state of consciousness"
                    ,"antisocial behaviour"
                    ,"anxiety"
                    ,"confusional state"
                    ,"delirium"
                    ,"depressed level of consciousness"
                    ,"depression"
                    ,"hallucination"
                    ,"loss of consciousness"
                    ,"panic reaction"
                    ,"psychomotor hyperactivity"
                    ,"self-injurious ideation"
                    ,"shock"
                    ,"suicidal ideation"
                    ,"suicide attempt"
        } 
    },
    {
        "query": "Are there any cardiovascular reactions for adolescent females using DESLORATADINE?",
        "filters": {'sex': 'F', 'age_group': 'adolescent', 'prod_ai': ['DESLORATADINE'], 'reaction_category': 'Cardiovascular'},
        "expected_pts": set()
    },
   
    {
        "query": "Show me musculoskeletal reactions for elderly females using MONTELUKAST.",
        "filters": {'sex': 'F', 'age_group': 'elderly', 'prod_ai': ['MONTELUKAST'], 'reaction_category': 'Musculoskeletal'},
        "expected_pts":set()
    },
    {
        "query": "Are there any adverse effects for a drug not in the database, like 'NONEXISTENTDRUG'?",
        "filters": {'prod_ai': 'NONEXISTENTDRUG'},
        "expected_pts": set()
        
    },
   
    {
        "query": "Report any product issues for children taking DESLORATADINE.",
        "filters": {'age_group': 'child', 'prod_ai': ['DESLORATADINE'], 'reaction_category': 'Product/Administration Issues'},
        "expected_pts": { "expired product administered"
        ,"off label use"
        }
      
    },
    {
        "query": "What are the neurological reactions for adult females using FEXOFENADINE?",
        "filters": {'sex': 'F', 'age_group': 'adult', 'prod_ai': ['FEXOFENADINE'], 'reaction_category': 'Neurological'},
        "expected_pts": {"amnesia"
            ,"aphonia"
            ,"asthenia"
            ,"balance disorder"
            ,"disorientation"
            ,"disturbance in attention"
            ,"dizziness"
            ,"dizziness postural"
            ,"dyskinesia"
            ,"gait disturbance"
            ,"headache"
            ,"insomnia"
            ,"loss of personal independence in daily activities"
            ,"muscle spasms"
            ,"muscle twitching"
            ,"musculoskeletal stiffness"
            ,"myalgia"
            ,"myopathy"
            ,"paraesthesia"
            ,"parosmia"
            ,"polyneuropathy"
            ,"rhabdomyolysis"
            ,"somnolence"}
        
    },
        {
        "query": "Are there any psychiatric adverse events for infants using MONTELUKAST?",
        "filters": {'age_group': 'infant', 'prod_ai': ['MONTELUKAST'], 'reaction_category': 'Psychiatric'},
        "expected_pts": set()
    },
    {
        "query": "List all cardiovascular reactions for elderly patients taking LORATADINE.",
        "filters": {'age_group': 'elderly', 'prod_ai': ['LORATADINE'], 'reaction_category': 'Cardiovascular'},
        "expected_pts": { "angina pectoris"
            ,"arrhythmia"
            ,"atrial fibrillation"
            ,"bradyphrenia"
            ,"cardiac disorder"
            ,"cardiac pacemaker insertion"
            ,"electrocardiogram qt prolonged"
            ,"heart rate increased"
            ,"hypertension"
            ,"nodal arrhythmia"
            ,"palpitations"
            ,"presyncope"
            ,"sinus bradycardia"
            ,"syncope"
            ,"tachycardia"
            ,"tachyphrenia"
            ,"torsade de pointes"
        }
    }
]

In [118]:
queries_for_RAG = [
"What skin reactions are reported for adult females who used desloratadine?"
,"List all dermatological adverse events for males over 60 years old who took FEXOFENADINE."
,"Are there any gastrointestinal reactions reported for children using CETIRIZINE?"
,"What are the common adverse reactions reported for adult patients using LORATADINE, regardless of sex or reaction type?"
,"List any adverse effects of Pseudoephedrine in infants."
,"What are the psychiatric adverse events for young adults using MONTELUKAST?"
,"Report any adverse reactions to a fictional drug called 'ZYX-987' for any patient group."
,"Show respiratory reactions to DESLORATADINE in males under 12 years old."
,"What are the cardiovascular reactions of CETIRIZINE in elderly patients?"
,"Show neurological reactions for adult males taking LORATADINE."
,"Are there any general systemic disorders reported for infants on DESLORATADINE?"
,"What are the musculoskeletal adverse events for females using FEXOFENADINE?"
,"Report product administration issues for any patient taking MONTELUKAST."
,"What are the most common reactions to CETIRIZINE in adolescent females?"
,"Are there any adverse effects for a drug called 'XYZ-123'?"
,"Show me reactions related to the liver for adult males taking LORATADINE."
,"What are the respiratory issues for children using FEXOFENADINE?"
,"List allergic reactions to DESLORATADINE in elderly females."
,"What are the gastrointestinal effects of cetirizine in infants?"
,"Report any psychiatric issues with LORATADINE in children."
,"Are there any cardiovascular reactions for adolescent females using DESLORATADINE?"
,"What are the common reactions for adult patients taking FEXOFENADINE?"
,"List any adverse effects of a combination drug like 'CETIRIZINE\\PSEUDOEPHEDRINE' for any age or sex."
,"Show me musculoskeletal reactions for elderly females using MONTELUKAST."
,"Are there any adverse effects for a drug not in the database, like 'NONEXISTENTDRUG'?"
,"What are the most common reactions reported for adolescents using LORATADINE?"
,"Report any product issues for children taking DESLORATADINE."
,"What are the neurological reactions for adult females using FEXOFENADINE?"
,"Show me general disorders for males under 12 years old taking CETIRIZINE."
,"Are there any psychiatric adverse events for infants using MONTELUKAST?"
,"List all cardiovascular reactions for elderly patients taking LORATADINE."
]

### Batch run of sample queries

In [119]:
generate_responses_for_queries(queries_for_RAG)


--- Generating Responses for Provided Queries ---
RAG components already initialized.

--- Processing Query 1/31: 'What skin reactions are reported for adult females who used desloratadine?' ---
RAG components already initialized.
DEBUG: received query: 'What skin reactions are reported for adult females who used desloratadine?'
DEBUG: lower_query: 'what skin reactions are reported for adult females who used desloratadine?'
DEBUG: Direct whole-word match for 'DESLORATADINE' found and matched to: ['DESLORATADINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE\\PSEUDOEPHEDRINE']
DEBUG: returning filters: {'sex': 'F', 'age_group': 'adult', 'prod_ai': ['DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE\\PSEUDOEPHEDRINE', 'DESLORATADINE'], 'reaction_category': 'Dermatological/Allergic'}
DEBUG: Calculated dynamic desired_k: 30 (based on 4096 n_ctx, 500 prompt, 500 response, 100 avg doc)
DEBUG: Initial semantic retrieval returned 2000 nodes.
DEBUG: Applying metadata filters:

### Evaluation of the RAG model based on various hyperparameter tunings

In [25]:
evaluate_rag_model(test_cases)


--- Starting RAG Model Evaluation
RAG components already initialized.
RAG components initialized for evaluation.

--- Running Test Case 1/25 ---
Query: What skin reactions are reported for adult females who used desloratadine?
Ground Truth Reactions: {'generalised erythema', 'skin dystrophy', 'rash', 'skin wound', 'pruritus generalised', 'dermatitis exfoliative generalised', 'rash erythematous', 'skin plaque', 'guttate psoriasis', 'erythema nodosum', 'hyperhidrosis', 'hypersensitivity', 'pruritus', 'pallor', 'photosensitivity reaction', 'skin ulcer', 'erythema', 'skin lesion', 'skin disorder', 'rash maculo-papular'}
Expected Retrieved PTs (for retrieval eval): {'generalised erythema', 'skin dystrophy', 'rash', 'skin wound', 'pruritus generalised', 'dermatitis exfoliative generalised', 'rash erythematous', 'skin plaque', 'guttate psoriasis', 'erythema nodosum', 'hyperhidrosis', 'hypersensitivity', 'pruritus', 'pallor', 'photosensitivity reaction', 'skin ulcer', 'erythema', 'skin lesion

  result = _qa_chain_global({"query": query})


DEBUG: Initial semantic retrieval returned 2000 nodes.
DEBUG: Applying metadata filters: {'sex': 'F', 'age_group': 'adult', 'prod_ai': ['DESLORATADINE\\PSEUDOEPHEDRINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE'], 'reaction_category': 'Dermatological/Allergic'}
DEBUG: After filtering, 63 nodes remain.
DEBUG: Selected 30 documents (out of 63 filtered) for LLM context.

Model Response:
- erythema
- erythema nodosum
- generalised erythema
- hypersensitivity
- photosensitivity reaction
- pruritus
- rash erythematous
- rash maculo-papular

Sources Used:
Document 1:
  Content snippet: Adverse Drug Reaction Report:
Product Active Ingredient: DESLORATADINE. Drug Name: AERIUS (DESLORATADINE). Patient Age: 58.0 years (adult). Patient Sex: F. Reported Reaction: rash erythematous. Reacti...
  Metadata: {'primaryid': 88102123, 'drugname': 'AERIUS (DESLORATADINE)', 'prod_ai': 'DESLORATADINE', 'pt': 'rash erythematous', 'reaction_category': 'Dermatological/Allergic', 'age': '58.0', 's

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.11
ROUGE-L F1: 0.56
Jaccard Similarity: 0.40

--- Running Test Case 2/25 ---
Query: List all dermatological adverse events for males over 60 years old who took FEXOFENADINE.
Ground Truth Reactions: {'rash pruritic', 'eczema', 'lip swelling', 'acute generalised exanthematous pustulosis', 'skin mass', 'hypersensitivity', 'erythema', 'skin disorder', 'skin irritation', 'pruritus'}
Expected Retrieved PTs (for retrieval eval): {'rash pruritic', 'eczema', 'lip swelling', 'acute generalised exanthematous pustulosis', 'skin mass', 'hypersensitivity', 'erythema', 'skin disorder', 'skin irritation', 'pruritus'}
RAG components already initialized.
DEBUG: received query: 'List all dermatological adverse events for males over 60 years old who took FEXOFENADINE.'
DEBUG: lower_query: 'list all dermatological adverse events for males over 60 years old who took fexofenadine.'
DEBUG: Direct whole-word match for 'FEXOFENADINE' found and matched to: ['FEXOFENADINE HYDROCHL

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.61
ROUGE-L F1: 0.80
Jaccard Similarity: 0.70

--- Running Test Case 3/25 ---
Query: Are there any gastrointestinal reactions reported for children using CETIRIZINE?
Ground Truth Reactions: {'abdominal pain upper', 'gastrooesophageal reflux disease', 'vomiting', 'abdominal pain', 'nausea', 'abdominal discomfort', 'diarrhoea', 'coeliac disease'}
Expected Retrieved PTs (for retrieval eval): {'abdominal pain upper', 'gastrooesophageal reflux disease', 'vomiting', 'abdominal pain', 'nausea', 'abdominal discomfort', 'diarrhoea', 'coeliac disease'}
RAG components already initialized.
DEBUG: received query: 'Are there any gastrointestinal reactions reported for children using CETIRIZINE?'
DEBUG: lower_query: 'are there any gastrointestinal reactions reported for children using cetirizine?'
DEBUG: Direct whole-word match for 'CETIRIZINE' found and matched to: ['CETIRIZINE HYDROCHLORIDE', 'CETIRIZINE HYDROCHLORIDE\\PSEUDOEPHEDRINE HYDROCHLORIDE', 'CAFFEINE\\CETIR

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.32
ROUGE-L F1: 0.70
Jaccard Similarity: 0.62

--- Running Test Case 4/25 ---
Query: List any adverse effects of Pseudoephedrine in infants.
Ground Truth Reactions: set()
Expected Retrieved PTs (for retrieval eval): set()
RAG components already initialized.
DEBUG: received query: 'List any adverse effects of Pseudoephedrine in infants.'
DEBUG: lower_query: 'list any adverse effects of pseudoephedrine in infants.'
DEBUG: Direct whole-word match for 'PSEUDOEPHEDRINE' found and matched to: ['CETIRIZINE HYDROCHLORIDE\\PSEUDOEPHEDRINE HYDROCHLORIDE', 'LORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'FEXOFENADINE HYDROCHLORIDE\\PSEUDOEPHEDRINE HYDROCHLORIDE', 'FEXOFENADINE\\PSEUDOEPHEDRINE', 'DESLORATADINE\\PSEUDOEPHEDRINE']
DEBUG: returning filters: {'age_group': 'infant', 'prod_ai': ['CETIRIZINE HYDROCHLORIDE\\PSEUDOEPHEDRINE HYDROCHLORIDE', 'DESLORATADINE\\PSEUDOEPHEDRINE', 'FEXOFENADINE\\PSEUDOEPHEDRINE', 'LORATADINE\\PSEUDO

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.00
ROUGE-L F1: 0.24
Jaccard Similarity: 0.20

--- Running Test Case 9/25 ---
Query: Show neurological reactions for adult males taking LORATADINE.
Ground Truth Reactions: {'rhabdomyolysis', 'impaired work ability', 'dizziness', 'parkinsonism', 'asthenia', 'paranoia', 'dizziness postural', 'ageusia', 'insomnia', 'muscle oedema', 'headache', 'paraesthesia', 'myalgia', 'dyskinesia', 'anosmia', 'muscle spasms', 'muscular weakness', 'somnolence', 'muscle contractions involuntary', 'musculoskeletal discomfort', 'musculoskeletal chest pain'}
Expected Retrieved PTs (for retrieval eval): {'rhabdomyolysis', 'impaired work ability', 'dizziness', 'parkinsonism', 'asthenia', 'paranoia', 'dizziness postural', 'ageusia', 'insomnia', 'muscle oedema', 'headache', 'paraesthesia', 'myalgia', 'dyskinesia', 'anosmia', 'muscle spasms', 'muscular weakness', 'somnolence', 'muscle contractions involuntary', 'musculoskeletal discomfort', 'musculoskeletal chest pain'}
RAG compone

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.00
ROUGE-L F1: 0.27
Jaccard Similarity: 0.24

--- Running Test Case 10/25 ---
Query: Are there any general systemic disorders reported for infants on DESLORATADINE?
Ground Truth Reactions: set()
Expected Retrieved PTs (for retrieval eval): set()
RAG components already initialized.
DEBUG: received query: 'Are there any general systemic disorders reported for infants on DESLORATADINE?'
DEBUG: lower_query: 'are there any general systemic disorders reported for infants on desloratadine?'
DEBUG: Direct whole-word match for 'DESLORATADINE' found and matched to: ['DESLORATADINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE\\PSEUDOEPHEDRINE']
DEBUG: returning filters: {'age_group': 'infant', 'prod_ai': ['DESLORATADINE\\PSEUDOEPHEDRINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE'], 'reaction_category': 'General/Systemic Disorders'}
DEBUG: Calculated dynamic desired_k: 30 (based on 4096 n_ctx, 500 prompt, 500 response, 100 avg doc)
DEB

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.03
ROUGE-L F1: 0.46
Jaccard Similarity: 0.50

--- Running Test Case 12/25 ---
Query: Report product administration issues for any patient taking MONTELUKAST.
Ground Truth Reactions: set()
Expected Retrieved PTs (for retrieval eval): set()
RAG components already initialized.
DEBUG: received query: 'Report product administration issues for any patient taking MONTELUKAST.'
DEBUG: lower_query: 'report product administration issues for any patient taking montelukast.'
DEBUG: Direct whole-word match for 'MONTELUKAST' found and matched to: ['LEVOCETIRIZINE DIHYDROCHLORIDE\\MONTELUKAST SODIUM']
DEBUG: returning filters: {'prod_ai': ['LEVOCETIRIZINE DIHYDROCHLORIDE\\MONTELUKAST SODIUM'], 'reaction_category': 'Product/Administration Issues'}
DEBUG: Calculated dynamic desired_k: 30 (based on 4096 n_ctx, 500 prompt, 500 response, 100 avg doc)
DEBUG: Initial semantic retrieval returned 2000 nodes.
DEBUG: Applying metadata filters: {'prod_ai': ['LEVOCETIRIZINE DIHYDR

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.18
ROUGE-L F1: 1.00
Jaccard Similarity: 1.00

--- Running Test Case 15/25 ---
Query: List allergic reactions to DESLORATADINE in elderly females.
Ground Truth Reactions: {'pallor', 'eczema', 'hypersensitivity', 'hyperhidrosis', 'pruritus'}
Expected Retrieved PTs (for retrieval eval): {'pallor', 'eczema', 'hypersensitivity', 'hyperhidrosis', 'pruritus'}
RAG components already initialized.
DEBUG: received query: 'List allergic reactions to DESLORATADINE in elderly females.'
DEBUG: lower_query: 'list allergic reactions to desloratadine in elderly females.'
DEBUG: Exact match of raw query 'desloratadine' to normalized 'desloratadine'.
DEBUG: Extracted keyword-based drug 'desloratadine' and matched to: ['DESLORATADINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE\\PSEUDOEPHEDRINE']
DEBUG: returning filters: {'sex': 'F', 'age_group': 'elderly', 'prod_ai': ['DESLORATADINE\\PSEUDOEPHEDRINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE'

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 1.00
ROUGE-L F1: 1.00
Jaccard Similarity: 1.00

--- Running Test Case 16/25 ---
Query: Are there any general disorders for adult males taking MONTELUKAST?
Ground Truth Reactions: set()
Expected Retrieved PTs (for retrieval eval): set()
RAG components already initialized.
DEBUG: received query: 'Are there any general disorders for adult males taking MONTELUKAST?'
DEBUG: lower_query: 'are there any general disorders for adult males taking montelukast?'
DEBUG: Direct whole-word match for 'MONTELUKAST' found and matched to: ['LEVOCETIRIZINE DIHYDROCHLORIDE\\MONTELUKAST SODIUM']
DEBUG: returning filters: {'sex': 'M', 'age_group': 'adult', 'prod_ai': ['LEVOCETIRIZINE DIHYDROCHLORIDE\\MONTELUKAST SODIUM']}
DEBUG: Calculated dynamic desired_k: 30 (based on 4096 n_ctx, 500 prompt, 500 response, 100 avg doc)
DEBUG: Initial semantic retrieval returned 2000 nodes.
DEBUG: Applying metadata filters: {'sex': 'M', 'age_group': 'adult', 'prod_ai': ['LEVOCETIRIZINE DIHYDRO

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.09
ROUGE-L F1: 0.67
Jaccard Similarity: 0.50

--- Running Test Case 18/25 ---
Query: Report any psychiatric issues with LORATADINE in children.
Ground Truth Reactions: {'depressed level of consciousness', 'depression', 'confusional state', 'psychomotor hyperactivity', 'abnormal behaviour', 'aggression', 'shock', 'antisocial behaviour', 'altered state of consciousness', 'suicide attempt', 'agitation', 'anxiety', 'hallucination', 'delirium', 'panic reaction', 'self-injurious ideation', 'suicidal ideation', 'loss of consciousness'}
Expected Retrieved PTs (for retrieval eval): {'depressed level of consciousness', 'depression', 'confusional state', 'psychomotor hyperactivity', 'abnormal behaviour', 'aggression', 'shock', 'antisocial behaviour', 'altered state of consciousness', 'suicide attempt', 'agitation', 'anxiety', 'hallucination', 'delirium', 'panic reaction', 'self-injurious ideation', 'suicidal ideation', 'loss of consciousness'}
RAG components alrea

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.09
ROUGE-L F1: 0.54
Jaccard Similarity: 0.44

--- Running Test Case 19/25 ---
Query: Are there any cardiovascular reactions for adolescent females using DESLORATADINE?
Ground Truth Reactions: set()
Expected Retrieved PTs (for retrieval eval): set()
RAG components already initialized.
DEBUG: received query: 'Are there any cardiovascular reactions for adolescent females using DESLORATADINE?'
DEBUG: lower_query: 'are there any cardiovascular reactions for adolescent females using desloratadine?'
DEBUG: Direct whole-word match for 'DESLORATADINE' found and matched to: ['DESLORATADINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE\\PSEUDOEPHEDRINE']
DEBUG: returning filters: {'sex': 'F', 'age_group': 'adolescent', 'prod_ai': ['DESLORATADINE\\PSEUDOEPHEDRINE', 'DESLORATADINE\\PSEUDOEPHEDRINE SULFATE', 'DESLORATADINE'], 'reaction_category': 'Cardiovascular'}
DEBUG: Calculated dynamic desired_k: 30 (based on 4096 n_ctx, 500 prompt, 500 response, 100

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 1.00
ROUGE-L F1: 1.00
Jaccard Similarity: 1.00

--- Running Test Case 23/25 ---
Query: What are the neurological reactions for adult females using FEXOFENADINE?
Ground Truth Reactions: {'rhabdomyolysis', 'musculoskeletal stiffness', 'dizziness', 'amnesia', 'muscle twitching', 'disturbance in attention', 'disorientation', 'asthenia', 'balance disorder', 'dizziness postural', 'myopathy', 'insomnia', 'polyneuropathy', 'gait disturbance', 'headache', 'paraesthesia', 'parosmia', 'myalgia', 'loss of personal independence in daily activities', 'muscle spasms', 'aphonia', 'somnolence', 'dyskinesia'}
Expected Retrieved PTs (for retrieval eval): {'rhabdomyolysis', 'musculoskeletal stiffness', 'dizziness', 'amnesia', 'muscle twitching', 'disturbance in attention', 'disorientation', 'asthenia', 'balance disorder', 'dizziness postural', 'myopathy', 'insomnia', 'polyneuropathy', 'gait disturbance', 'headache', 'paraesthesia', 'parosmia', 'myalgia', 'loss of personal in

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.00
ROUGE-L F1: 0.36
Jaccard Similarity: 0.35

--- Running Test Case 24/25 ---
Query: Are there any psychiatric adverse events for infants using MONTELUKAST?
Ground Truth Reactions: set()
Expected Retrieved PTs (for retrieval eval): set()
RAG components already initialized.
DEBUG: received query: 'Are there any psychiatric adverse events for infants using MONTELUKAST?'
DEBUG: lower_query: 'are there any psychiatric adverse events for infants using montelukast?'
DEBUG: Direct whole-word match for 'MONTELUKAST' found and matched to: ['LEVOCETIRIZINE DIHYDROCHLORIDE\\MONTELUKAST SODIUM']
DEBUG: returning filters: {'age_group': 'infant', 'prod_ai': ['LEVOCETIRIZINE DIHYDROCHLORIDE\\MONTELUKAST SODIUM'], 'reaction_category': 'Psychiatric'}
DEBUG: Calculated dynamic desired_k: 30 (based on 4096 n_ctx, 500 prompt, 500 response, 100 avg doc)
DEBUG: Initial semantic retrieval returned 2000 nodes.
DEBUG: Applying metadata filters: {'age_group': 'infant', 'prod_ai'

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 1.00
BLEU Score: 0.01
ROUGE-L F1: 0.38
Jaccard Similarity: 0.41

--- Overall RAG Model Evaluation Results 
Average Retrieval Precision: 0.52
Average Retrieval Recall: 0.85
Average Hit Rate: 1.00
Average BERTScore F1: 1.00
Average ROUGE-L F1: 0.80
Average BLEU Score: 0.62
Average Jaccard Score: 0.57


([1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0],
 [0.6,
  1.0,
  0.875,
  1.0,
  1.0,
  1.0,
  1.0,
  0.2,
  0.42857142857142855,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.75,
  0.5,
  1.0,
  1.0,
  1.0,
  1.0,
  0.5217391304347826,
  1.0,
  0.4117647058823529],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0],
 [0.56,
  0.8,
  0.6956521739130436,
  1.0,
  1.0,
  1.0,
  1.0,
  0.23529411764705882,
  0.2702702702702703,
  1.0,
  0.4615384615384615,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.6666666666666666,
  0.5416666666666666,
  1.0,
  1.0,
  1.0,
  1.0,
  0.35555555555555557,
  1.0,
  0.37837837837837834],
 [0.10896207834315334,
  0.6065306597126334,


# UI Block

Gradio-based light weight UI

In [26]:
def clean_and_format_response(response_text):
    # formatting the list of responses
    if not response_text.strip():
        return "No reactions found."

    lines = [line.strip("-• ").capitalize() for line in response_text.split("\n") if line.strip()]
    if not lines:
        return "No reactions found."

    return "### Reported Adverse Reactions:\n\n" + "\n".join(f"- {line}" for line in lines)

def query_interface(user_input):
    try:
        response, sources, _ = run_rag_query(user_input, progress=None)

        formatted_response = clean_and_format_response(response)
        
        # displaying the priamry id of the reports for response transparency
        primary_ids = re.findall(r"'primaryid':\s*(\d+)", sources)
        if primary_ids:
            primary_str = ", ".join(f"Primary ID: {pid}" for pid in primary_ids)
            source_snippets = f"Retrieved from cases: {primary_str}"
    
        formatted_sources = f"### Example Source Documents:\n\n{source_snippets}" if source_snippets else "No relevant sources found."

        return formatted_response, formatted_sources
    except Exception as e:
        return f"Error: {str(e)}", "No sources returned."

with gr.Blocks() as demo:
    gr.Markdown("# 🔍 FAERS Adverse Reaction Explorer (RAG Model)")
    gr.Markdown("Ask a question about antihistamines and adverse reactions (e.g., drug + age group + sex).")

    with gr.Row():
        user_input = gr.Textbox(label="Enter Your Query", placeholder="e.g. What adverse reactions are seen in elderly males taking desloratadine?")

    with gr.Row():
        output_box = gr.Markdown(label="RAG Model Response")
        source_box = gr.Markdown(label="Supporting Source Snippets")

    submit_button = gr.Button("Generate Answer")

    submit_button.click(fn=query_interface, inputs=[user_input], outputs=[output_box, source_box])

demo.launch()

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.




RAG components already initialized.
DEBUG: received query: 'List alll cardiovascular reactions for elderly patients taking LORATADINE'
DEBUG: lower_query: 'list alll cardiovascular reactions for elderly patients taking loratadine'
DEBUG: Direct whole-word match for 'LORATADINE' found and matched to: ['LORATADINE', 'LORATADINE\\PSEUDOEPHEDRINE SULFATE']
DEBUG: returning filters: {'age_group': 'elderly', 'prod_ai': ['LORATADINE', 'LORATADINE\\PSEUDOEPHEDRINE SULFATE'], 'reaction_category': 'Cardiovascular'}
DEBUG: Calculated dynamic desired_k: 30 (based on 4096 n_ctx, 500 prompt, 500 response, 100 avg doc)
DEBUG: Initial semantic retrieval returned 2000 nodes.
DEBUG: Applying metadata filters: {'age_group': 'elderly', 'prod_ai': ['LORATADINE', 'LORATADINE\\PSEUDOEPHEDRINE SULFATE'], 'reaction_category': 'Cardiovascular'}
DEBUG: After filtering, 36 nodes remain.
DEBUG: Selected 30 documents (out of 36 filtered) for LLM context.
DEBUG: Allowed PTs from retrieved documents: {'palpitations',