In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
#Extract Data From the PDF File
def load_pdf_file(data):
    loader= DirectoryLoader(data,
                            glob="*.pdf",
                            loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [3]:
extracted_data=load_pdf_file(data='Data/')

In [4]:
#Split the Data into Text Chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [5]:
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 39994


In [7]:
from langchain.embeddings import HuggingFaceEmbeddings

In [8]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [9]:
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [10]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [11]:
query_result

[-0.03447727486491203,
 0.031023185700178146,
 0.006734943483024836,
 0.026108991354703903,
 -0.039362020790576935,
 -0.16030248999595642,
 0.06692396104335785,
 -0.0064414748921990395,
 -0.04745050519704819,
 0.01475885696709156,
 0.07087533175945282,
 0.05552756041288376,
 0.019193293526768684,
 -0.026251325383782387,
 -0.010109511204063892,
 -0.026940522715449333,
 0.02230747975409031,
 -0.02222663350403309,
 -0.14969263970851898,
 -0.017493100836873055,
 0.007676226552575827,
 0.05435233935713768,
 0.003254474140703678,
 0.03172595798969269,
 -0.08462139964103699,
 -0.029405970126390457,
 0.05159565806388855,
 0.04812402278184891,
 -0.0033147968351840973,
 -0.05827918276190758,
 0.041969284415245056,
 0.022210702300071716,
 0.12818880379199982,
 -0.022338924929499626,
 -0.011656263843178749,
 0.06292839348316193,
 -0.032876282930374146,
 -0.09122605621814728,
 -0.03117532655596733,
 0.05269956961274147,
 0.04703480750322342,
 -0.08420305699110031,
 -0.030056172981858253,
 -0.020744

In [12]:
from dotenv import load_dotenv
load_dotenv()

True

In [25]:
import os

# Define API keys as strings
PINECONE_API_KEY = "pcsk_5phmf8_8NbPuMCYsCdQY5HL4ib65AHiefJAPU4snr7NQCZzqWdGARf1cqiJnkaMKLNaeL"
OPENAI_API_KEY = "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"

# Set environment variables
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


In [27]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"


pc.create_index(
    name=index_name,
    dimension=384, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws", 
        region="us-east-1"
    ) 
) 

In [28]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [29]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [30]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x332bbddf0>

In [31]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [32]:
retrieved_docs = retriever.invoke("What is Acne?")

In [33]:
retrieved_docs

[Document(id='0c864cb9-0a9c-4b28-8d38-004ca96917e9', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='5e71bd0a-32ac-4a2d-8869-d606b0a92d6c', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'total_pages': 4505.0}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSeb

In [39]:
!ollama pull mistral


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠇ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠏ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
[?2026h[?25l[A[1Gpulling manifest [K     ▏    0 B/4.1 GB                  [K[?25h[?2026l
[?2026h[?25l[A[1Gpulling manifest [K     ▏    0 B/4.1 GB                  [K[?25h[?2026l
[?2026h[?25l[A[1Gpulling manifest [K     ▏    0 B/4.1 GB                  [K[?25h[?2026l
[?2026h[?25l[A[1Gp

In [41]:
from langchain_community.llms import Ollama  # Correct import

# Load the Ollama model
llm = Ollama(model="mistral")  # Uses Mistral model

# Test if the model is working
response = llm.invoke("What is Artificial Intelligence?")
print(response)


  llm = Ollama(model="mistral")  # Uses Mistral model


 Artificial Intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their ability to learn, reason, problem-solve, perceive, and make decisions. AI can be categorized into two main types: Narrow AI, which is designed to perform a specific task, such as voice recognition or driving a car; and General AI, which has the ability to understand, learn, and apply knowledge across a wide range of tasks at a level equal to or beyond human capability. AI systems use various techniques from computer science, mathematics, psychology, linguistics, and other fields to achieve their goals.


In [42]:
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [43]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


In [44]:
# ✅ Create Retrieval-Augmented Generation (RAG) Chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [45]:
# ✅ Ask Questions
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print("Answer:", response["answer"])

Answer:  Acromegaly is a disorder caused by excessive growth hormone production from the pituitary gland. This leads to abnormal growth in bones and soft tissues throughout the body. Gigantism, on the other hand, is a similar condition that occurs during childhood, resulting in unusually tall stature due to overproduction of growth hormone before the growth plates in long bones close.


In [46]:
response = rag_chain.invoke({"input": "What is stats?"})
print("Answer:", response["answer"])

Answer:  Statistics is a branch of mathematics that deals with the collection, analysis, interpretation, and presentation of data. In psychology, statistics is often used to understand patterns and trends in behavior or performance, such as scores on standardized tests like the Stanford-Binet scale. The context provided explains two statistical concepts: standardization (the process of creating a reference point for test results) and standard deviation (a measure of how far scores are from the average).
