In [2]:
# Show the kernel's current working directory before changing it.
%pwd

'd:\\The Monarch\\Coding\\LLM Projects\\RAG-based-Medical-Chatbot\\research'

In [3]:
import os

# The notebook lives in research/, but the relative paths used later (e.g.
# "data/") are relative to the project root, so step up one level.
# Guarded on the directory name so that re-running this cell does not keep
# climbing further up the filesystem.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'd:\\The Monarch\\Coding\\LLM Projects\\RAG-based-Medical-Chatbot'

In [12]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
def load_pdf_file(data):
    """Load the PDF files matching ``*.pdf`` in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files,
        each of which is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [14]:
# Load the PDFs from data/ (relative to the working directory set earlier).
extracted_data = load_pdf_file("data/")

In [15]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum size of each chunk, as measured by the
            splitter's length function. Defaults to 500, the value that
            was previously hard-coded.
        chunk_overlap: Amount of overlap between consecutive chunks.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [16]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
num_chunks = len(text_chunks)
print("Length of text chunks:", num_chunks)

Length of text chunks: 5860


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
import sentence_transformers

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the HuggingFace embedding model used for documents and queries.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model that was previously hard-coded.
            NOTE: this notebook creates its Pinecone index with
            ``dimension=384``; a model with a different vector length
            needs an index of matching dimension.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [11]:
# Instantiate the embedding model once; it is reused for indexing and querying.
embeddings = download_hugging_face_embeddings()

In [18]:
# Sanity check: embed a short string and report the vector length.
query_result = embeddings.embed_query("Hello World")
embedding_dim = len(query_result)
print("Length of Query Embedding vector", embedding_dim)

Length of Query Embedding vector 384


In [38]:
from dotenv import load_dotenv
# Read variables from a .env file; the call's return value is shown as the cell output.
load_dotenv()

True

In [39]:
# Read the service credentials from the environment (None when a key is absent).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [40]:
# Re-export the API keys into the process environment for the client libraries.
# os.environ only accepts strings, so a key that was not found (None) used to
# raise an opaque TypeError here; fail with a message naming the missing key.
for _env_name, _env_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    ("OPENROUTER_API_KEY", OPENROUTER_API_KEY),
):
    if _env_value is None:
        raise RuntimeError(
            f"{_env_name} is not set; add it to your .env file or environment."
        )
    os.environ[_env_name] = _env_value

In [25]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# create_index fails when the index already exists, so only create it on the
# first run; this keeps the cell safe under Restart & Run All.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector length (the check above printed 384).
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [27]:
from langchain_pinecone import PineconeVectorStore

# Embed the text chunks and store them in the Pinecone index.
# NOTE(review): re-running this cell appears to upload the chunks again
# (document ids look auto-generated) — confirm before re-executing.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

In [28]:
# Connect to the index that already exists instead of building it from documents.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [29]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x124dadab730>

In [34]:
# Retrieve up to k=3 chunks, keeping only those that meet the 0.6 score threshold.
retriever = docsearch.as_retriever(
    search_type='similarity_score_threshold',
    search_kwargs={
        'k': 3,
        'score_threshold': 0.6,
    },
)

In [35]:
# Smoke-test the retriever with a sample question and display the returned documents.
retrieved_docs = retriever.invoke('What is Acne?')
retrieved_docs

[Document(id='e3a13d8c-196e-4fba-a076-5e40886c40a4', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\the_gale_encyclopedia_of_medicine.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='b4aa9fd4-bccd-475a-b9de-f5ab1d6f9f07', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\the_gale_encyclopedia_of_medicine.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researche

In [36]:
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI

In [41]:
# Chat model that writes the final answer from the retrieved context.
# Presumably authenticates via the GOOGLE_API_KEY environment variable — confirm.
llm = ChatGoogleGenerativeAI(
    model='gemini-2.0-flash',
    temperature=0.4,
    max_tokens=500,
)

In [48]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

# System instructions for the model. Adjacent string literals inside the
# parentheses are concatenated, so no line-continuation backslashes are needed.
# {context} is the placeholder for the retrieved chunks.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, simply say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions, then the user's
# question supplied under the 'input' key.
prompt = ChatPromptTemplate.from_messages([
    ('system', system_prompt),
    ('human', '{input}'),
])

In [42]:
# Chain that places the retrieved documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# Full RAG pipeline: retrieve documents for the input, then answer from them.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [49]:
# Ask a sample question end-to-end and print only the generated answer.
question = "What Acromegaly and Gigantism?"
response = rag_chain.invoke({'input': question})
print(response['answer'])

Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue. When this abnormality occurs before bone growth stops, it results in unusual height, known as gigantism. Acromegaly is relatively rare, affecting approximately 50 out of every one million people.
