In [6]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.groq import Groq  # Using Groq for speed
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import KeywordNodePostprocessor
import os


In [7]:
from dotenv import load_dotenv
# Load variables from a local .env file (python-dotenv) into the process
# environment so the os.getenv("GROQ_API_KEY") lookup in configure_settings()
# can find the key without it being hard-coded in the notebook.
load_dotenv()
# 1. Configuration =============================================================
def configure_settings():
    """Set up the global llama_index ``Settings`` used by the RAG system.

    Assigns the Groq-hosted LLM, the HuggingFace embedding model and the
    chunking parameters on the shared ``Settings`` object.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is unset or
            blank. Checking here surfaces a clear configuration error up
            front instead of a less obvious failure once the LLM is used.
    """
    # Tolerate stray whitespace/newlines that often sneak into .env values.
    api_key = (os.getenv("GROQ_API_KEY") or "").strip()
    if not api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set. Export it or add it to your .env file "
            "before configuring the LLM."
        )

    Settings.llm = Groq(model="llama-3.1-8b-instant", api_key=api_key)
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    Settings.chunk_size = 512  # Chunk size applied when documents are split
    Settings.chunk_overlap = 50  # Overlap kept between neighbouring chunks

In [8]:
# 2. Document Processing =======================================================
def load_and_index_documents():
    """Read the PDFs under ``medical_papers`` and build a vector index.

    Returns:
        The index produced by ``VectorStoreIndex.from_documents`` for the
        loaded documents.
    """
    # Reader restricted to ``.pdf`` files, descending into sub-folders.
    pdf_reader = SimpleDirectoryReader(
        input_dir="medical_papers",
        required_exts=[".pdf"],
        recursive=True,
    )
    papers = pdf_reader.load_data()

    # show_progress=True requests progress reporting while the index is built.
    paper_index = VectorStoreIndex.from_documents(papers, show_progress=True)
    return paper_index

In [9]:
# 3. Query Engine ==============================================================
def create_query_engine(index):
    """Build a keyword-filtered query engine on top of ``index``.

    Args:
        index: Object exposing ``as_query_engine`` (in this notebook, the
            index returned by ``load_and_index_documents``).

    Returns:
        The query engine returned by ``index.as_query_engine``.
    """
    # Post-retrieval filter configured with required and excluded terms.
    keyword_filter = KeywordNodePostprocessor(
        required_keywords=["treatment", "study", "patient"],
        exclude_keywords=["animal", "in vitro"],
    )

    engine_options = {
        "similarity_top_k": 5,  # number of nodes requested per query
        "node_postprocessors": [keyword_filter],
        "response_mode": "compact",
    }
    return index.as_query_engine(**engine_options)

In [10]:
# 4. Main Application ==========================================================
def main():
    """Run the interactive question/answer loop over the indexed papers.

    Steps:
        1. Verify that the ``medical_papers`` directory exists and is not
           empty; print an error and return otherwise.
        2. Configure the global settings, build the index and query engine.
        3. Prompt for questions until the user types ``exit`` (any case,
           surrounding whitespace ignored), closes stdin, or interrupts.

    Returns:
        None.
    """
    docs_dir = "medical_papers"

    # Check the corpus *before* configuring models: there is no point loading
    # the embedding model when there is nothing to index. ``isdir`` (rather
    # than ``exists``) also keeps ``os.listdir`` from raising when the path
    # happens to be a regular file.
    if not os.path.isdir(docs_dir) or not os.listdir(docs_dir):
        print("Error: No documents found in 'medical_papers' directory")
        return

    configure_settings()

    print("Building knowledge index...")
    index = load_and_index_documents()
    query_engine = create_query_engine(index)

    # Simple query interface
    print("\nMedical Research Assistant (type 'exit' to quit)")
    while True:
        try:
            query = input("\nEnter your medical question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user interrupted: leave the loop
            # quietly instead of surfacing a traceback.
            print()
            break

        if not query:
            # Blank input: re-prompt rather than spending an LLM call on it.
            continue
        if query.lower() == "exit":
            break

        response = query_engine.query(query)
        print(f"\nAnswer: {response}\n")
        print("Sources:")
        for node in response.source_nodes[:3]:  # Show top 3 sources
            # The score is optional on retrieved nodes; formatting None with
            # ".2f" would raise TypeError, so fall back to a placeholder.
            score = "n/a" if node.score is None else f"{node.score:.2f}"
            print(f"- {node.text[:150]}... (Score: {score})")

# Entry point: launch the assistant only when this code runs as the top-level
# module (__name__ == "__main__"); importing it elsewhere will not call main().
if __name__ == "__main__":
    main()

Building knowledge index...


Parsing nodes: 100%|██████████| 141/141 [00:00<00:00, 179.36it/s]
Generating embeddings: 100%|██████████| 531/531 [01:02<00:00,  8.47it/s]



Medical Research Assistant (type 'exit' to quit)

Answer: Recent research has shown that desmopressin, a synthetic analogue of vasopressin, is effective in treating central diabetes insipidus. It can be administered orally, and its pharmacokinetics have been studied in both children and adults. Additionally, orally disintegrating tablets of desmopressin have been found to be safe and effective in patients with central diabetes insipidus. Furthermore, ultralow doses of vasopressin have also been used in the management of diabetes insipidus, and studies have shown that they can be effective in reducing symptoms.

Sources:
- There remains a need for improved utilisation of existing therapies, particularly
parenteral prostacyclin pathway agents (PPAs), as well as the target... (Score: 0.70)
- https://doi.org/10.10 
56/NEJM197603042941001.
81. Ward MK, Fraser TR. DDAVP in treatment of vasopressin-
sensitive diabetes insipidus. Br Med J 1974;... (Score: 0.69)
- J Clin 
Endocrinol Metab 2016