## 📥 Step 1: Load PDF Documents
We use `DirectoryLoader` with `PyPDFLoader` to load all PDF files in the `data/` directory.

In [8]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

DATA_PATH = "data/"

def load_pdf_files(data):
    loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

documents = load_pdf_files(DATA_PATH)
print(f"Loaded {len(documents)} PDF pages")

Loaded 377 PDF pages


## 📘 Step 2: Chunk PDF Text
We split text into ~400 token chunks with 50 token overlap using `RecursiveCharacterTextSplitter`.

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
chunks = splitter.split_documents(documents)
print(f"Total Chunks: {len(chunks)}")

Total Chunks: 1971


## Step 3: Create Embeddings and Vector Store

In [10]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embedding_model)
print(" FAISS vector store created")

  from .autonotebook import tqdm as notebook_tqdm


 FAISS vector store created


In [11]:

from dotenv import load_dotenv, find_dotenv
import os

# Load .env file
load_dotenv(find_dotenv())

# Fetch Groq API key
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Show secure confirmation
if GROQ_API_KEY:
    print("✅ GROQ_API_KEY Found:", True)
    print("🔐 First 10 characters:", GROQ_API_KEY[:10] + "********")
else:
    print("❌ GROQ_API_KEY not found. Please check your .env file.")


✅ GROQ_API_KEY Found: True
🔐 First 10 characters: gsk_I2cO3q********


## Step 4: Load LLM from grog

In [19]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv, find_dotenv
import os

# Load .env
load_dotenv(find_dotenv())

# Load Groq API key
groq_api_key = os.getenv("GROQ_API_KEY")
assert groq_api_key, "❌ GROQ_API_KEY not found in .env"

# Load LLM (e.g. Mixtral, Gemma, LLaMA3)
llm = ChatGroq(
    model_name="gemma-7b-it",  # or use "llama3-70b-8192", "gemma-7b-it"
    api_key=groq_api_key
)


## Step 5: Prompt Template + QA Chain

In [17]:
from dotenv import load_dotenv, find_dotenv
import os
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq

# Load environment
load_dotenv(find_dotenv())
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Load Groq LLM
llm = ChatGroq(
    model_name="gemma2-9b-it",  # You can use llama3-70b-8192 or gemma-7b-it
    api_key=GROQ_API_KEY
)

# Create your prompt
prompt_template = PromptTemplate(
    template="""
Use the information in the context below to answer the question.
If unsure, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
""",
    input_variables=["context", "question"]
)

# QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)


## 🧪 Step 6: Run Sample Queries

In [18]:
sample_questions = [
    "Give me the correct coded classification for the following diagnosis: Recurrent depressive disorder, currently in remission",
    "What are the diagnostic criteria for Obsessive-Compulsive Disorder (OCD)?"
]

for query in sample_questions:
    print(f"\n🔎 Query: {query}")
    result = qa_chain.invoke({"query": query})  # Uses Groq-backed LLM
    print("✅ Answer:", result["result"])

    print("📌 Context Snippets:")
    for i, doc in enumerate(result["source_documents"], 1):
        snippet = doc.page_content.strip().replace("\n", " ")[:150]
        print(f"- Chunk {i}: {snippet}...")



🔎 Query: Give me the correct coded classification for the following diagnosis: Recurrent depressive disorder, currently in remission
✅ Answer: F33.4  

📌 Context Snippets:
- Chunk 1: MENTAL AND BEHAVIOURAL DISORDERS F33.4 Recurrent depressive disorder, currently in remission Diagnostic guidelines For a definite diagnosis: (a) the c...
- Chunk 2: recurrent depressive disorder (F33.-). These grades of severity are specified to cover a wide range of clinical states that are encountered in differe...
- Chunk 3: .10 Recurrent brief depressive disorder F38.8 Other specified mood [affective] disorders F39 Unspecified mood [affective] disorder 111...

🔎 Query: What are the diagnostic criteria for Obsessive-Compulsive Disorder (OCD)?
✅ Answer: For a definite diagnosis, obsessional symptoms or compulsive acts, or both, must be present on most days for at least 2 successive  

📌 Context Snippets:
- Chunk 1: underlying personality. Onset is usually in childhood or early adult life. The course is 