In [6]:
%pwd

'c:\\Users\\prana\\Downloads\\MedBot\\research'

In [7]:
import os
os.chdir("../")

In [8]:
%pwd

'c:\\Users\\prana\\Downloads\\MedBot'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
def load_pdf_file(data):
    loader = DirectoryLoader(data,
    glob="*.pdf",
    loader_cls = PyPDFLoader)

    documents = loader.load()
    return documents

In [9]:
extracted_data = load_pdf_file(data='Data/')

In [11]:
# extracted_data

In [14]:
def text_split(extractted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [15]:
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5859


In [16]:
from langchain.embeddings import HuggingFaceEmbeddings

In [23]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

embeddings = download_hugging_face_embeddings()

In [25]:
query_result = embeddings.embed_query("Hello World")
print("Length", len(query_result))

Length 384


In [47]:
from dotenv import load_dotenv
load_dotenv()

True

In [48]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [28]:
# Creating Pinecone index using code instead of doing manual
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key = PINECONE_API_KEY)

index_name = "medbot"

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

{
    "name": "medbot",
    "metric": "cosine",
    "host": "medbot-hgkzndi.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [49]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [30]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [32]:
# Load Existing Index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings
)

In [33]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x246c6152a40>

In [34]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [35]:
retriever_docs = retriever.invoke("What is acne")

In [38]:
print(retriever_docs)

[Document(id='a1e3b34b-e265-4cc7-9656-21746ef58b41', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'), Document(id='25c250b7-4aad-4e1c-9fcd-663f99499c73', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria

In [50]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

In [51]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [53]:
system_prompt = (
    "You are an assistant for question answering tasks."
    "Use the following piece of retrieval context to answer"
    "the question. If you dont know the answer, say that you"
    "don't know. Use three sentences maximum and keep the"
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [55]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [56]:
response = rag_chain.invoke({"input": "What is Acne"})
print(response["answer"])

?

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. It is the most common skin disease, affecting nearly 17 million people in the United States. Acne can arise at any age, but it is most common in teenagers and young adults.
