In [23]:
# Cell 1: Install Libraries
!pip install -q langchain langchain-groq faiss-cpu sentence-transformers gradio

# Cell 2: Import Libraries
import pandas as pd
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq  # ‚úÖ Using Groq instead
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import gradio as gr

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
# Cell 3: Configure Groq API
# The key is read from Kaggle's secret store and exported through the
# environment, so it never appears in the notebook source.
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
os.environ.update({"GROQ_API_KEY": user_secrets.get_secret("GROQ_API_KEY")})
print("‚úì API Key loaded")

# Cell 4: Load Dataset
# Read the transcription samples CSV from the attached Kaggle input dataset.
df = pd.read_csv(
    '/kaggle/input/medicaltranscriptions/mtsamples.csv'
)
print(f"‚úì Dataset loaded: {df.shape}")


‚úì API Key loaded
‚úì Dataset loaded: (4999, 6)


In [30]:
# Clean the raw frame in a single chain:
#   1. drop rows that have no transcription,
#   2. fill missing specialty / description with placeholders,
#   3. build `full_text`, the labelled string that gets chunked later.
df = (
    df.dropna(subset=['transcription'])
    .assign(
        medical_specialty=lambda frame: frame['medical_specialty'].fillna('Unknown'),
        description=lambda frame: frame['description'].fillna(''),
    )
    .assign(
        full_text=lambda frame: (
            "Specialty: " + frame['medical_specialty'] + "\n"
            + "Case: " + frame['description'] + "\n"
            + "Content: " + frame['transcription']
        )
    )
)
print(f"‚úì Cleaned: {len(df)} records")

‚úì Cleaned: 4966 records


In [31]:
# Making chunks: split every record's full_text into 500-character pieces
# with a 50-character overlap, wrapping each piece in a Document.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

docs = []
record_columns = zip(df['full_text'], df['medical_specialty'], df['description'])
for full_text, specialty, description in record_columns:
    # Each chunk gets its own metadata dict, carrying the source record's
    # specialty and the first 100 characters of its description.
    docs.extend(
        Document(
            page_content=piece,
            metadata={
                'specialty': specialty,
                'description': description[:100]
            }
        )
        for piece in splitter.split_text(full_text)
    )

print(f"‚úì Created {len(docs)} chunks")

‚úì Created 40605 chunks


In [32]:
# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformers model,
# index the chunks in FAISS, and persist the index to the working directory
# so it can be loaded back from disk.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Building FAISS index...")
vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("/kaggle/working/medical_faiss")
print("‚úì Vector store saved!")

Building FAISS index...
‚úì Vector store saved!


In [33]:
# Cell 8: Load and Test Retriever
# Reload the index from disk with the same embedding object that built it.
# The deserialization opt-in flag is passed explicitly; the files being read
# are the ones this notebook saved to the same path.
vectorstore = FAISS.load_local(
    "/kaggle/working/medical_faiss",
    embeddings,
    allow_dangerous_deserialization=True,
)

# Ask the retriever for k=3 results per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))
print("‚úì Retriever ready")

‚úì Retriever ready


In [34]:
# Cell 9: Initialize the Groq chat model
# ChatGroq is configured with the llama-3.3-70b-versatile model, a low
# temperature and a 512-token cap on each completion.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0.2,
    max_tokens=512,
)

# Smoke test: one round trip to confirm the key and model name are accepted.
response = llm.invoke("Say: API Working!")
print(f"‚úì Groq LLM: {response.content}")


‚úì Groq LLM: API Working!


In [35]:
# Cell 10: Build RAG Pipeline
# Prompt text with two placeholders: {context} and {question}.
template = """Use the medical context to answer the question accurately.
Cite the medical specialty when relevant.

Context: {context}

Question: {question}

Answer:"""

qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Wire the LLM and retriever together; source documents are returned with
# each answer so callers can display where the answer came from.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True,
)
print("‚úì RAG pipeline ready")


‚úì RAG pipeline ready


In [36]:
# Single end-to-end check of the RAG chain.
# The question is kept in one variable so the printed "Q:" line can never
# drift from what was actually sent, and the chain is run through
# `.invoke(...)` rather than by calling the chain object directly, which
# LangChain has deprecated.
test_query = "What are symptoms of diabetes?"
result = qa_chain.invoke({"query": test_query})

print("\n=== TEST QUERY ===")
print(f"Q: {test_query}")
print(f"\nA: {result['result']}")
print(f"\nSources: {len(result['source_documents'])} documents")

# Preview only the first two sources: specialty plus a 150-character excerpt.
for i, doc in enumerate(result['source_documents'][:2]):
    print(f"\n{i+1}. {doc.metadata['specialty']}")
    print(f"   {doc.page_content[:150]}...")

  result = qa_chain({"query": "What are symptoms of diabetes?"})



=== TEST QUERY ===
Q: What are symptoms of diabetes?

A: According to the medical specialty of Endocrinology, the symptoms of diabetes can vary depending on the type of diabetes. Common symptoms of diabetes include:

1. Polyuria (frequent urination) - as mentioned in the patient's review of systems, the patient does not have polyuria.
2. Polydipsia (excessive thirst)
3. Polyphagia (increased hunger)
4. Weight loss
5. Fatigue
6. Blurred vision
7. Slow healing of cuts and wounds
8. Tingling or numbness in the hands and feet

In the case of the patient, she has a family history of type II diabetes (her father and grandfather had type II diabetes) and her son has type I diabetes. She herself has type II diabetes mellitus, but the only symptom mentioned related to her diabetes is that she only checks her blood sugars in the morning.

It's worth noting that the patient's symptoms of right shoulder pain are not typically associated with diabetes, and would be more relevant to the medical spe

In [37]:
# Fixed list of evaluation questions for the RAG chain, written to CSV
# in the working directory under a single `query` column.
eval_queries = [
    "What are the symptoms of diabetes?",
    "How is hypertension diagnosed?",
    "What is the treatment for pneumonia?",
    "Describe the procedure for colonoscopy",
    "What are complications of coronary artery disease?",
    "What medications are used for asthma?",
    "Explain the signs of heart failure",
    "What is the difference between Type 1 and Type 2 diabetes?",
    "How is breast cancer screened?",
    "What are risk factors for stroke?",
    "Describe symptoms of COPD",
    "How is appendicitis diagnosed?",
    "What are treatments for migraine headaches?",
    "Explain the procedure for knee replacement surgery",
    "What are signs of sepsis?",
    "How is depression treated?",
    "What causes kidney stones?",
    "Describe the stages of Alzheimer's disease",
    "How is tuberculosis diagnosed?",
    "What are treatments for rheumatoid arthritis?",
    "Explain the procedure for cataract surgery",
    "What are symptoms of thyroid disorders?",
    "How is anemia diagnosed?",
    "What are risk factors for diabetes?",
    "Describe treatment options for chronic back pain",
    "How is hepatitis C treated?",
    "What are symptoms of gastroesophageal reflux disease?",
    "Explain the procedure for hip replacement",
    "What causes chronic kidney disease?",
    "How is multiple sclerosis diagnosed?",
    "What are treatments for anxiety disorders?",
]

queries_frame = pd.DataFrame(eval_queries, columns=['query'])
queries_frame.to_csv('/kaggle/working/evaluation_queries.csv', index=False)
print(f"‚úì Saved {len(eval_queries)} evaluation queries")

‚úì Saved 31 evaluation queries


In [38]:
# Cell 13: Run Evaluation
# Runs the RAG chain over the first 10 evaluation queries and records, for
# each one, the answer, how many source chunks came back, and the specialties
# of those chunks. The results are written to CSV in the working directory.
print("\n=== RUNNING EVALUATION ===")

eval_subset = eval_queries[:10]  # Start with 10
total = len(eval_subset)  # progress denominator follows the slice above

results = []
for i, query in enumerate(eval_subset, 1):
    print(f"{i}/{total}: {query[:50]}...")
    # `.invoke` replaces the deprecated direct call on the chain object.
    response = qa_chain.invoke({"query": query})
    source_docs = response['source_documents']
    results.append({
        'query': query,
        'answer': response['result'],
        'num_sources': len(source_docs),
        'specialties': ', '.join(doc.metadata['specialty'] for doc in source_docs)
    })

eval_df = pd.DataFrame(results)
eval_df.to_csv('/kaggle/working/evaluation_results.csv', index=False)
print("\n‚úì Evaluation complete! Results saved.")
print(eval_df[['query', 'num_sources']].head())


=== RUNNING EVALUATION ===
1/10: What are the symptoms of diabetes?...
2/10: How is hypertension diagnosed?...
3/10: What is the treatment for pneumonia?...
4/10: Describe the procedure for colonoscopy...
5/10: What are complications of coronary artery disease?...
6/10: What medications are used for asthma?...
7/10: Explain the signs of heart failure...
8/10: What is the difference between Type 1 and Type 2 d...
9/10: How is breast cancer screened?...
10/10: What are risk factors for stroke?...

‚úì Evaluation complete! Results saved.
                                               query  num_sources
0                 What are the symptoms of diabetes?            3
1                     How is hypertension diagnosed?            3
2               What is the treatment for pneumonia?            3
3             Describe the procedure for colonoscopy            3
4  What are complications of coronary artery dise...            3


In [39]:
# Cell 14: Deploy Gradio App
def ask_question(question):
    """Answer one question with the RAG chain, formatted for the Gradio UI.

    Args:
        question: Text typed into the "Medical Question" box. ``None`` and
            empty or whitespace-only strings are treated as "no question".

    Returns:
        A ``(answer, sources)`` tuple of strings. ``sources`` has one entry
        per retrieved document, showing its specialty and the first 200
        characters of its text. When no question was given, a short prompt
        message is returned with an empty ``sources`` string.
    """
    question = (question or "").strip()
    if not question:
        # Return before touching the chain so a blank submission does not
        # trigger a retrieval and an LLM call.
        return "Please enter a medical question.", ""

    # `.invoke` replaces the deprecated direct call on the chain object.
    result = qa_chain.invoke({"query": question})
    sources = "\n\n".join(
        f"**Source {i+1}** ({doc.metadata['specialty']})\n{doc.page_content[:200]}..."
        for i, doc in enumerate(result['source_documents'])
    )
    return result['result'], sources

# UI components: one input box for the question, two output boxes for the
# answer and the formatted source excerpts returned by ask_question.
question_box = gr.Textbox(
    label="Medical Question",
    placeholder="Ask a medical question...",
    lines=2,
)
answer_box = gr.Textbox(label="Answer", lines=6)
sources_box = gr.Textbox(label="Source Documents", lines=8)

demo = gr.Interface(
    fn=ask_question,
    inputs=question_box,
    outputs=[answer_box, sources_box],
    title="üè• Medical RAG QA System",
    description="Ask questions based on medical transcriptions database",
    examples=[
        "What are symptoms of diabetes?",
        "How is hypertension treated?",
        "Explain colonoscopy procedure",
    ],
    theme=gr.themes.Soft(),
)

# share=True also creates a public link to the app, not just the local URL.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://ba0a162133a7cad905.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


