## Download and Install necessary packages, libraries, frameworks and dependencies.

In [None]:
!pip install langchain_community langchainhub chromadb langchain langchain-huggingface

## To download the dataset:
1. Go to kaggle website and get the access token
2. Download kaggle.json file
3. Upload the file here
4. Run the following cell to move the 'kaggle.json' file into the '~/.kaggle/' folder.

In [1]:

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


mv: cannot stat 'kaggle.json': No such file or directory


## Dataset downloading:
Create a folder "data" and download and store the CSV file for comprehensive medical QnA dataset in it.

In [2]:
import os
import subprocess
import sys
from pathlib import Path
from zipfile import ZipFile

# Kaggle dataset slug (you can replace with another)
DATASET_SLUG = "thedevastator/comprehensive-medical-q-a-dataset"
DATA_DIR = Path("./data")

# Ensure the Kaggle API is installed for the interpreter running this kernel.
# check=True raises CalledProcessError instead of silently ignoring a failure.
subprocess.run([sys.executable, "-m", "pip", "install", "kaggle"], check=True)

DATA_DIR.mkdir(parents=True, exist_ok=True)

# Download dataset; a failed download stops here with a clear error rather than
# surfacing later as a missing zip file.
subprocess.run(
    ["kaggle", "datasets", "download", "-d", DATASET_SLUG, "-p", str(DATA_DIR)],
    check=True,
)

# Unzip. The archive name is derived from the slug so that changing
# DATASET_SLUG does not leave a stale hard-coded file name behind.
zip_path = DATA_DIR / f"{DATASET_SLUG.split('/')[-1]}.zip"
with ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(DATA_DIR)

print("Dataset downloaded & extracted to ./data/")

Dataset downloaded & extracted to ./data/


## Data Pre-processing:
Pre-process the downloaded CSV file. Segregate the 'question' and 'answer' columns and drop rows that contain 'NULL' cells.

In [None]:
import pandas as pd

def preprocess_data(input_path: str, output_path: str):
    """Normalise the raw Q&A CSV and write a cleaned copy.

    The question and answer columns are renamed to ``"question"`` and
    ``"answer"``. When the file has headers spelled "question" and "answer"
    (case-insensitive, surrounding whitespace ignored) those columns are used,
    wherever they sit; otherwise the first two columns are taken to be the
    question and the answer, in that order. Rows containing any missing value
    are dropped.

    Args:
        input_path: Path of the raw CSV file to read.
        output_path: Path the cleaned CSV is written to (without the index).
    """
    df = pd.read_csv(input_path)

    # Prefer matching by header name: renaming purely by position mislabels the
    # data (and can create a duplicate "answer" column) when the file has a
    # leading extra column or a different column order.
    by_name = {str(col).strip().lower(): col for col in df.columns}
    if "question" in by_name and "answer" in by_name:
        question_col, answer_col = by_name["question"], by_name["answer"]
    else:
        question_col, answer_col = df.columns[0], df.columns[1]

    df = df.rename(columns={question_col: "question", answer_col: "answer"})
    df = df.dropna()
    df.to_csv(output_path, index=False)
    print(f"Preprocessed dataset saved to {output_path}")

if __name__ == "__main__":
    preprocess_data("./data/train.csv", "./data/medical_faqs_clean.csv")


## Access token for LLM/chat_model.

* **Note:** We will use HuggingFace embeddings to store the data in the vector DB and a HuggingFace chat model to generate the response. This choice follows the requirement in the assignment file that both the embeddings and the chat model be free of cost.

* To proceed, log in to the Hugging Face website and get an access token. Pass the token as "HUGGINGFACEHUB_API_TOKEN" through the following code.

In [None]:
import os
import getpass

# Ask for the Hugging Face token interactively (so it is never hard-coded in
# the notebook) and store it under HUGGINGFACEHUB_API_TOKEN for this process.
os.environ.update(
    HUGGINGFACEHUB_API_TOKEN=getpass.getpass("Enter your Hugging Face API key: ")
)

## Data Loader:
Using CSVLoader from LangChain to load the data.

In [4]:
from langchain_community.document_loaders import CSVLoader

# Read the cleaned Q&A CSV written by the pre-processing step into LangChain
# documents.
loader = CSVLoader(
    file_path="./data/medical_faqs_clean.csv",
    encoding="utf-8",
)
docs = loader.load()


In [None]:
# Sanity check: number of loaded documents, followed by the first one.
print(len(docs))
print(docs[0])

## Chunking:

In [7]:
# Import from the dedicated langchain-text-splitters package; the
# `langchain.text_splitter` path is a legacy re-export kept for backwards
# compatibility in older LangChain releases.
from langchain_text_splitters import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1000    # upper bound on the size of each chunk
CHUNK_OVERLAP = 100  # amount shared between consecutive chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)

In [None]:
# Show how many chunks were produced and preview at most the first ten.
# Slicing (instead of indexing range(10)) avoids an IndexError when fewer than
# ten chunks exist.
print(len(splits))
for chunk in splits[:10]:
    print(chunk)


## Database Preparation:
Save the chunked data into the vector DB. We are using chromaDB as our vector database.

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# The Chroma wrapper is provided by langchain_community (installed above);
# `langchain.vectorstores` is a deprecated alias for it.
from langchain_community.vectorstores import Chroma

# all-MiniLM-L6-v2 is a compact sentence-transformers embedding model
# (MiniLM-based, not LLaMA-based).
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed every chunk and index it in a Chroma collection. No persist_directory
# is passed, so nothing is explicitly written to disk here.
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_function)

In [None]:
# Number of stored chunks, then a small sample of records. Calling get() with
# no limit would print the entire collection into the notebook output.
print(vectorstore._collection.count())
print(vectorstore._collection.get(limit=3))

## Retriever:

In [13]:
# Expose the vector store through its retriever interface. No arguments are
# passed, so the store's default search settings apply.
retriever = vectorstore.as_retriever()

## LLM Prompt for augmentation

In [None]:
from langchain import hub
# Pull the shared "rlm/rag-prompt" template from the hub and print it so its
# wording and input variables can be inspected.
# NOTE(review): presumably the template expects `context` and `question`,
# matching the keys supplied by the RAG chain later in the notebook; confirm
# against the printed output.
prompt = hub.pull("rlm/rag-prompt")
print(prompt)

## Instantiating Chat-HuggingFace:

In [15]:
# HuggingFaceEndpoint must be imported alongside ChatHuggingFace: without it
# the constructor call below raises NameError on a fresh kernel.
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Hosted text-generation endpoint for the DeepSeek-R1 model.
llm = HuggingFaceEndpoint(
    repo_id="deepseek-ai/DeepSeek-R1-0528",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
    provider="auto",  # let Hugging Face choose the best provider for you
)

# Wrap the endpoint so it can be used as a chat model in the RAG chain.
chat_model = ChatHuggingFace(llm=llm)

## Preparation for RAG Pipelining:

In [17]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [18]:
# Merge the retrieved documents into one context string for the prompt.
def format_docs(docs):
    """Return the ``page_content`` of every document, joined by newlines."""
    contents = [doc.page_content for doc in docs]
    return "\n".join(contents)

In [19]:
# To ignore the "thinking" outputs from LLM response and extract the final answer to the query.
def clean_response(response: str) -> str:
    """
    Remove reasoning traces (<think>...</think>) and only return the final answer.

    Args:
        response: Raw text produced by the model.

    Returns:
        The text with reasoning removed and surrounding whitespace stripped.
        If the remaining text contains "Answer:", only what follows its last
        occurrence is returned.
    """
    import re
    # Remove every complete <think>...</think> block; DOTALL lets a trace span
    # several lines.
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
    # A closing tag left over without a matching opener means the text before
    # it is still reasoning, so keep only what follows the last closer.
    if "</think>" in response:
        response = response.rsplit("</think>", 1)[-1]
    # If the model prepends "Answer:" keep only that part
    if "Answer:" in response:
        response = response.split("Answer:")[-1].strip()
    return response.strip()

## RAG pipeline

In [20]:
# Pipeline stages, left to right: build the prompt inputs, fill the prompt,
# query the chat model, convert the reply to a plain string, then post-process
# it with clean_response.
# "context" comes from running the retriever on the chain input and joining the
# hits with format_docs; "question" is supplied through RunnablePassthrough.
rag_chain = ({"context" : retriever | format_docs, "question" : RunnablePassthrough()}
             | prompt
             | chat_model
             | StrOutputParser() | clean_response)

In [None]:
# Smoke-test the whole chain with a sample medical question.
rag_chain.invoke("What is corticosteroids used for?")

## App demo using gradio:

In [None]:
import gradio as gr

def rag_chat(query):
    """Answer a user query with the RAG chain.

    Blank or whitespace-only input is rejected with a short hint instead of
    being sent to the chain.
    """
    if query.strip():
        return rag_chain.invoke(query)
    return "Please enter a valid query."

# Build Gradio UI: a heading, a question box, an answer box and a submit button.
# Components are created inside the Blocks context in the order they should
# appear on the page.
with gr.Blocks() as demo:
    gr.Markdown("# 🏥 Medical RAG Chatbot")
    gr.Markdown("### Welcome 👋. Ask your medical questions below:")

    # Input and output each sit in their own row.
    with gr.Row():
        query_box = gr.Textbox(label="Your Question", placeholder="Type your query here...")
    with gr.Row():
        output_box = gr.Textbox(label="RAG Answer")

    submit_btn = gr.Button("Submit")

    # Clicking the button sends the question text to rag_chat and shows the
    # returned string in the answer box.
    submit_btn.click(fn=rag_chat, inputs=query_box, outputs=output_box)

# Launch app
# NOTE(review): share=True requests a shareable link from Gradio, which makes
# the demo reachable from outside this machine; confirm that is intended.
demo.launch(share=True)