# RAG Medical Research
### with Groq


In [56]:
# Import necessary libraries
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEndpoint
import warnings
from langchain_groq import ChatGroq
from langchain.prompts.prompt import PromptTemplate

# Load environment variables from a local .env file
# (presumably where the Groq API key lives — TODO confirm)
load_dotenv()

True

In [57]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that is intended.
warnings.filterwarnings("ignore")

# Chat model used by connect_chains() further down in this notebook.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,  # presumably chosen for repeatable answers — TODO confirm
    max_tokens=None,  # no explicit token limit is passed
    timeout=None,  # no explicit timeout is passed
    max_retries=2
)

---
## Data Ingestion

In [58]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path

def load_multiple_pdfs_from_folders(folder_paths):
    """
    Load text data from all PDF files in multiple folders.

    Folders are processed in the order given. Within each folder the PDFs
    are loaded in case-insensitive file-name order, so the order of the
    returned documents does not depend on how the operating system happens
    to list the directory.

    Args:
        folder_paths (list): List of folder paths to search for PDFs.
            Only the top level of each folder is searched. Entries that
            are not existing folders are reported and skipped.

    Returns:
        list: List of loaded documents (the results of ``PyPDFLoader.load``
        for every file, concatenated).
    """
    documents = []

    for folder_path in folder_paths:
        pdf_folder = Path(folder_path)

        # A mistyped or missing folder would otherwise contribute nothing
        # without any hint, so say so explicitly and move on.
        if not pdf_folder.is_dir():
            print(f"Skipping {pdf_folder}: not an existing folder")
            continue

        # Sort explicitly: glob() gives no ordering guarantee, and anything
        # derived from document order should be reproducible across runs.
        pdf_files = sorted(
            pdf_folder.glob("*.pdf"),
            key=lambda path: (path.name.casefold(), path.name),
        )
        for pdf_file in pdf_files:
            print(f"Loading {pdf_file.name}...")
            loader = PyPDFLoader(file_path=str(pdf_file))
            documents.extend(loader.load())

    return documents

# Example usage:
# NOTE(review): these are absolute paths on one developer's machine; adjust
# them before running the notebook elsewhere.
folder_paths = [
    r"C:\Users\peter\Desktop\ds_ai\repo_folder\nutrition-ai-assistant\data\raw\Parkinson",
    r"C:\Users\peter\Desktop\ds_ai\repo_folder\nutrition-ai-assistant\data\raw\MS",
    # Add more folder paths as needed
]

# Load all PDFs from the specified folders
medi_docs = load_multiple_pdfs_from_folders(folder_paths=folder_paths)
print(f"Loaded {len(medi_docs)} pages total")

# Preview the first loaded page; indexing an empty list would raise an
# IndexError, so report the empty result instead.
if medi_docs:
    print(medi_docs[0].page_content)
else:
    print("No PDF pages were loaded; check folder_paths.")


Loading 10-61474-ncs-2025-00004.pdf...
Loading diet-and-parkinsons.pdf...
Loading Diet_Guide_Update_7.1.21.pdf...
Loading Eating-Well-with-Parkinsons-Disease.pdf...
Loading Nutrition-Jessica-Schroeder.pdf...
Loading Nutrition-Parkinsons_A4Manual_Aug2021.pdf...
Loading Nutrition-PD-2.pdf...
Loading PARKINSON1614-Guideline-Nutrition-A4-ENG.pdf...
Loading 250128_nmss-nutrition-guide-en.pdf...
Loading Diet and MS (Dec 2024) WEB.pdf...
Loading modifiable-lifestyle-factors-and-ms-a-guide-for-health-professionals.pdf...
Loading Motivator_SF2023.pdf...
Loading MSology_Diet-and-Nutrition_Booklet-EN_V3.pdf...
Loading MSP-Nutrition-Toolkit-Multiple-Sclerosis-1.pdf...
Loading Nutrition-and-MS.pdf...
Loaded 493 pages total
Copyright © 2025 Author(s). This is an Open Access article distributed under the terms of the Creative Commons Attribution-
Noncommercial 4.0 International License (CC BY-NC 4.0), permitting all non-commercial use, distribution, and reproduction 
in any medium, provided the orig

---

## Document Chunking


In [59]:
# Import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Example chunking function
def split_documents(documents, chunk_size=300, chunk_overlap=50):
    """
    Split documents into overlapping chunks and tag each chunk with an id.

    Args:
        documents (list): Documents to split.
        chunk_size (int): Forwarded to the splitter as ``chunk_size``.
        chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        list: The chunks produced by the splitter. Each chunk's metadata
        gets an ``"id"`` entry of the form ``chunk_<position>``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(documents=documents)

    # Give each chunk a positional id so it can be mapped back later
    for position, piece in enumerate(pieces):
        piece.metadata["id"] = f"chunk_{position}"

    return pieces

In [60]:
# Split the loaded documents into chunks using split_documents' default
# chunk size and overlap
medi_chunks = split_documents(medi_docs)


---

## Embedding and Storage


In [61]:
# Import libraries
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
import numpy as np

# Example function for embeddings and storage

def create_embedding_vector_db(chunks, db_name):
    """
    Embed the chunks and persist them as a local FAISS vector store.

    The embeddings come from the open-source HuggingFaceEmbeddings model
    configured below; FAISS is the vector store used for similarity search.

    Args:
        chunks (list): Documents to embed and index.
        db_name (str): Suffix of the target folder; the store is saved to
            ``../vector_databases/vector_db_<db_name>`` (a relative path).

    Returns:
        The vector store that was built and saved.
    """
    # Embedding model; the encoder is asked to normalize its embeddings
    embedder = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2',
        encode_kwargs={"normalize_embeddings": True},
    )

    # Build the vector store from the chunks.
    # See the DistanceStrategy enum for the other available strategies.
    store = FAISS.from_documents(
        documents=chunks,
        embedding=embedder,
        distance_strategy=DistanceStrategy.COSINE,
    )

    # Save the store locally so it can be loaded again later
    target_dir = f"../vector_databases/vector_db_{db_name}"
    store.save_local(target_dir)
    return store

In [None]:
# Embed all chunks and save the resulting vector store under
# ../vector_databases/vector_db_medi
all_embedding=create_embedding_vector_db(chunks=medi_chunks, db_name="medi")

---

## Retrieval from FAISS

In [63]:
# Implement retrieval logic from your FAISS database
def retrieve_from_vector_db(vector_db_path):
    """
    Load a locally saved FAISS vector store and derive a retriever from it.

    Args:
        vector_db_path (str): Folder that contains the saved vector store.

    Returns:
        tuple: ``(retriever, vectorstore)`` — the retriever obtained from
        ``as_retriever()`` and the loaded FAISS store itself.
    """
    # Same embedding model settings as in create_embedding_vector_db
    embedder = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2',
        encode_kwargs={"normalize_embeddings": True},
    )

    # SECURITY NOTE(review): allow_dangerous_deserialization=True lets the
    # loader unpickle data from vector_db_path. Only point this at stores
    # you created yourself; never at files from an untrusted source.
    store = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embedder,
        allow_dangerous_deserialization=True,
        distance_strategy=DistanceStrategy.COSINE,
    )
    return store.as_retriever(), store

# Load the retriever and the underlying vector store from the saved "medi" database
react_retriever,react_vectorstore = retrieve_from_vector_db("../vector_databases/vector_db_medi")
# Last expression of the cell: displays the types of both returned objects
type(react_retriever),type(react_vectorstore)

(langchain_core.vectorstores.base.VectorStoreRetriever,
 langchain_community.vectorstores.faiss.FAISS)

In [64]:
# Sample question used to test the retrieval step.
# NOTE(review): "import" is presumably a typo for "important"; the text is
# left unchanged because the recorded result below was produced with it.
query="""
what is import for diets of people with Parkinsons diseases?
"""

In [65]:
# Fetch the three best-matching chunks for the query. The number of results
# is configured through search_kwargs on the retriever, and invoke() replaces
# the deprecated get_relevant_documents() call.
react_vectorstore.as_retriever(search_kwargs={"k": 3}).invoke(query)

[Document(id='104ca06e-1ae7-473d-aaa1-db4047044658', metadata={'source': 'C:\\Users\\peter\\Desktop\\ds_ai\\repo_folder\\nutrition-ai-assistant\\data\\raw\\Parkinson\\Nutrition-Parkinsons_A4Manual_Aug2021.pdf', 'page': 42, 'id': 'chunk_1109'}, page_content='Nutrition and Parkinson’s  // 41\nTake Charge of \nParkinson’s \nThis Diet and Nutrition book is designed for people with Parkinson’s, carers and \nhealth professionals. It is written by a registered dietician who specialises in the \nnutritional needs unique to people with Parkinson’s.\nThe printing of'),
 Document(id='f123cb48-956b-4a18-a64a-6493f40a6793', metadata={'source': 'C:\\Users\\peter\\Desktop\\ds_ai\\repo_folder\\nutrition-ai-assistant\\data\\raw\\Parkinson\\Diet_Guide_Update_7.1.21.pdf', 'page': 11, 'id': 'chunk_618'}, page_content="12\nThe Michael J. Fox Foundation for Parkinson's Research  |  Parkinson’s Disease and Diet: A Practical Guide\n + Olive oil: main cooking oil\n + Whole grains: three servings per 

---

## Connecting Retrieval with LLM

In [66]:
# Prompt template for the retrieval chain. {context} and {input} are the
# template placeholders; everything else is sent to the model verbatim.
system_prompt = """
You are NutriRetrieve, a data extraction and structuring engine for nutritional and medical information from scientific sources.

**Your Task:**
Extract and structure medical/nutritional data from the provided context to support downstream LLM processing.

**Context:**
{context}

**User Query:**
{input}

---

### Extraction Guidelines

1. **Extract Medical & Dietary Data**
   - Medical conditions → dietary goals, contraindications, recommended ingredients
   - Allergies/intolerances → safe substitutes and warnings
   - Source citations for each recommendation

2. **Output Format (Clear Text)**

**Medical Recommendations:**
For each condition identified, provide:
- Condition name
- Dietary goals (what to eat and why)
- Contraindications (what to avoid and why)
- Recommended ingredients
- Source citations

**Allergies & Intolerances:**
For each allergen, list:
- Allergen name
- Safe substitutes
- Warnings and precautions

**Nutritional Targets:**
- Macronutrients: target grams for carbs, protein, fat
- Micronutrients: specific vitamins and minerals with target values
- Hydration: recommended daily water intake in ml

**Recommended Ingredients:**
For each ingredient, provide:
- Ingredient name
- Health benefits
- Preparation methods
- Recommended portion size

3. **Validation Rules**
   - Only include data supported by provided context
   - If information is missing, note it clearly (e.g., "Not specified in provided sources")
   - Use clear section headers and bullet points for readability
   - Include citations and sources for all recommendations
"""

In [67]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.prompts import ChatPromptTemplate

# Write a function to create retrieval and document processing chains
def connect_chains(retriever):
    """
    Build a retrieval chain from the given retriever.

    The module-level ``llm`` and ``system_prompt`` are combined into a
    stuff-documents chain, which is then wired to the retriever.

    Args:
        retriever: Retriever handed to ``create_retrieval_chain``.

    Returns:
        The chain returned by ``create_retrieval_chain``.
    """
    # Turn the prompt text into a chat prompt template
    prompt = ChatPromptTemplate.from_template(system_prompt)

    # Chain that passes the retrieved documents and the prompt to the LLM
    combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

    return create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=combine_docs_chain,
    )


# Build the retrieval chain on top of the retriever loaded from the FAISS store
react_retrieval_chain = connect_chains(react_retriever)


In [68]:
# Invoke your chain with a sample question
# NOTE(review): the question text contains typos ("advise", "what a healthy
# diet"); it is left unchanged because the recorded answer was produced with it.
output = react_retrieval_chain.invoke(
    {"input": "give me a nutritional advise. I have Parkinsons and what a healthy diet."}
)
# Show only the generated text stored under the 'answer' key
print(output['answer'])

**Medical Recommendations:**

### Parkinson's Disease

* **Condition name:** Parkinson's Disease
* **Dietary goals:**
	+ Eat a balanced diet that includes whole grains, fruits, vegetables, lean proteins, and healthy fats.
	+ Include olive oil as the main cooking oil.
	+ Consume three servings of whole grains per week (e.g., whole grain bread or cereal, oats, brown rice, quinoa).
* **Contraindications:**
	+ Avoid foods high in saturated and trans fats, added sugars, and sodium.
	+ Limit or avoid foods that can exacerbate symptoms, such as caffeine, spicy foods, and heavy metals (e.g., lead, mercury).
* **Recommended ingredients:**
	+ Olive oil: rich in healthy fats, may help reduce inflammation and improve symptoms.
	+ Whole grains: rich in fiber, vitamins, and minerals, may help regulate blood sugar and improve gut health.
* **Source citations:**
	+ The Michael J. Fox Foundation for Parkinson's Research (MJFF) - "Parkinson’s Disease and Diet: A Practical Guide"

**Allergies & Intoler

---

## Interactive Chat System


In [69]:
#medi_retriever = retrieve_from_vector_db("../vector_databases/vector_db_medi")

#medi_retrieval_chain = connect_chains(medi_retriever[0])

In [70]:
# Define your interactive chat querying function
#def print_output(
#    inquiry,
#    retrieval_chain=react_retrieval_chain
#):
#    result = retrieval_chain.invoke({"input": inquiry})
#    print(result['answer'].strip("\n"))

In [71]:
# Run and test your interactive chat system
#print_output("wie sollten sich menschen mit ALS ernähren?")

In [72]:
# Define your interactive chat querying function
#def chat_with_rag(chain):
#    """
#    Interactive function to chat with the RAG system.
#    """
#    print("Welcome to the RAG Chat! Type 'exit' to quit.\n")
#    while True:
#        user_input = input("🧑 You: ")
#        if user_input.lower() in ["exit", "quit"]:
#            print("👋 Exiting the chat. Goodbye!")
#            break
#        try:
#            result = chain.invoke({"input": user_input})
#            print(f"🤖 RAG Answer: {result['answer']}\n")
#        except Exception as e:
#            print(f" Error: {e}\n")


In [73]:
# Run your interactive chat
#chat_with_rag(react_retrieval_chain)