In [234]:
import os

# Enter the "research" folder only when it is reachable from the current
# working directory. If the kernel already starts inside research/ there is
# no such child folder, and an unconditional chdir would raise
# FileNotFoundError and stop a Restart & Run All on the first cell.
if os.path.isdir("research"):
    os.chdir("research")

In [235]:
# Smoke test: confirms the kernel is running and cells execute.
print("OK")

OK


In [236]:
# Show the current working directory (expected to end in "research" here).
%pwd 

'd:\\MEDICAL-CHATBOT-AI\\research'

In [237]:
import os

# Step up to the project root, but only while still inside "research".
# An unguarded chdir("../") climbs one directory further every time this
# cell is re-run, which silently breaks relative paths such as 'Data/'.
if os.path.basename(os.getcwd()).lower() == "research":
    os.chdir("../")

In [238]:
# Show the current working directory again to confirm the move up one level.
%pwd

'd:\\MEDICAL-CHATBOT-AI'

In [239]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [240]:
# Extract the data from the PDF files in a directory
def load_pdf_file(data):
    """Load PDF files from the directory ``data``.

    Args:
        data: Directory path handed to ``DirectoryLoader``. Files are
            selected with the glob ``*.pdf`` and read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns (presumably a list of
        LangChain ``Document`` objects -- confirm against the installed
        langchain version).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [241]:
# Load the PDFs under the project-relative 'Data/' folder; the relative path
# depends on the working directory set by the chdir cells above.
extracted_data = load_pdf_file(data = 'Data/')

In [242]:
#extracted_data

In [243]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults
            to 20.

    Returns:
        The result of ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [244]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
n_chunks = len(text_chunks)
print("Length of text chunks : ", n_chunks)

Length of text chunks :  5859


In [245]:
#text_chunks

In [246]:
from langchain.embeddings import HuggingFaceEmbeddings

In [247]:
# Download the embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Build the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance created with
        ``model_name='sentence-transformers/all-MiniLM-L6-v2'``.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

In [248]:
# Instantiate the embedding model once; later cells reuse `embeddings`.
embeddings = download_hugging_face_embeddings()

In [249]:
# Sanity check: embed a short string and report the vector length, which the
# Pinecone index dimension has to match.
query_result = embeddings.embed_query("Hello World")
vector_length = len(query_result)
print("Length : ", vector_length)

Length :  384


In [250]:
#query_result

In [251]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv (presumably from a .env file
# in or above the working directory); the cell displays the call's return value.
load_dotenv()

True

In [258]:
# Read the service credentials from the process environment.
# Each value is None when the corresponding variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

In [253]:
from pinecone import Pinecone, ServerlessSpec

# Build the Pinecone client in the notebook itself: `pc` has to be defined
# here, otherwise this cell raises NameError on a fresh kernel.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"

# Create the serverless index only when it does not exist yet, so the cell
# can be re-run safely.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector length; the embed_query check in
        # this notebook printed 384 for all-MiniLM-L6-v2.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index for direct Pinecone operations.
index = pc.Index(index_name)

In [254]:
import os

# Re-export both keys into the process environment (presumably so that
# libraries reading these variables directly can find them), failing early
# with a readable message when one is missing. Assigning None to os.environ
# would otherwise raise an opaque "TypeError: str expected, not NoneType".
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GEMINI_API_KEY", GEMINI_API_KEY),
):
    if not _key_value:
        raise EnvironmentError(
            f"{_key_name} is not set - add it to the .env file or the environment"
        )
    os.environ[_key_name] = _key_value


In [257]:
# Embed each chunk and upsert the embeddings into the Pinecone index
import hashlib

from langchain_pinecone import PineconeVectorStore

# Deterministic ids (chunk position + content hash) make this cell safe to
# re-run: the same chunk overwrites its earlier vector instead of being
# stored again. The retrieval output in this notebook shows one passage
# returned twice under two different ids, which is what repeated runs
# without stable ids appear to produce.
# NOTE(review): vectors duplicated by earlier runs stay in the index until
# they are deleted or the index is recreated.
chunk_ids = [
    hashlib.sha256(f"{position}:{chunk.page_content}".encode("utf-8")).hexdigest()
    for position, chunk in enumerate(text_chunks)
]

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
    ids=chunk_ids,
)

MaxRetryError: HTTPSConnectionPool(host='api.pinecone.io', port=443): Max retries exceeded with url: /indexes (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001D956B289D0>: Failed to resolve 'api.pinecone.io' ([Errno 11001] getaddrinfo failed)"))

In [None]:
# Attach to the index that already exists instead of uploading the chunks again
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [None]:
#docsearch

In [None]:
# Retriever over the vector store: similarity search with k=3 per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# Probe the retriever with a sample question and display the returned
# documents (ids, metadata and page_content) for manual inspection.
retrieved_docs = retriever.invoke("What is acne?")
retrieved_docs

[Document(id='956cde51-a4fe-41a0-aa1d-eb56eeae7b44', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='681d3874-50df-47c6-851e-83ef638510b7', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='46896063-0b7d-4675-a7b5-be23cb06b107', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 

In [None]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI

# Expose the Gemini key under GOOGLE_API_KEY as well, and hand the same
# value to the client explicitly.
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

# Chat model used to generate the answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.4,
    max_output_tokens=500,
    google_api_key=GEMINI_API_KEY,
)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [None]:
# System instructions for the answering model. "{context}" is a template
# placeholder filled with the retrieved passages when the chain runs.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three statements maximum and keep the "
    "answer concise. "
    "\n\n"
    "{context}"
)


# Two-message chat template: the system message carries the instructions
# from `system_prompt`, the human message carries the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [None]:
# Chain that feeds the retrieved documents and the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: retrieve documents for the input, then answer from them.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# Run the chain on a sample question and print only the "answer" field of
# the response.
# NOTE(review): "giganism" in the query looks like a typo for "gigantism" -
# confirm and re-run if the spelling matters for retrieval.
response = rag_chain.invoke({"input": "What is Acromegaly and giganism?"})
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland.  This leads to increased bone and soft tissue growth, and other bodily disturbances.  Gigantism is also related to excess growth hormone, but the timing of the hormone excess differs from acromegaly.
