In [1]:
%pwd

'c:\\Users\\rohan\\OneDrive\\Documents\\Masters Project\\Medical-chatbot\\research'

In [2]:
import os
# Move the working directory up one level so that later relative paths
# (e.g. 'data/') resolve from the parent folder rather than from 'research'.
# NOTE(review): the path is relative to the *current* directory, so running
# this cell a second time moves up one more level.
os.chdir("../")

In [3]:
%pwd

'c:\\Users\\rohan\\OneDrive\\Documents\\Masters Project\\Medical-chatbot'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [5]:
def load_pdf(data):
    """Load the PDF files matched by ``*.pdf`` inside a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``, with each
        matched file read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs from 'data/'; the path is relative to the current working
# directory, so it depends on the earlier os.chdir having been run.
extracted_data = load_pdf(data='data/')

In [7]:
# extracted_data

In [8]:
#Split the Data into text chunks

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of ``load_pdf``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many chunks were produced
# (the recorded run printed 8706).
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 8706


In [10]:
#Download the Embeddings from Hugging Face
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the HuggingFace embedding model used throughout the notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model that was previously hard-coded. If a
            different model is used, the Pinecone index ``dimension`` must be
            changed to match that model's vector length.

    Returns:
        The ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Build the embedding model once; the same object is reused below for the
# probe query and for both Pinecone vector-store constructors.
embeddings = download_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Sanity check: embed a short string and print the vector length. The recorded
# run printed 384, which is the value used as the Pinecone index dimension.
result = embeddings.embed_query("Hello World")
print("Length", len(result)) 

Length 384


In [12]:
#result

In [41]:
from dotenv import load_dotenv
# presumably populates os.environ from a local .env file — confirm the file
# exists next to the working directory.
load_dotenv()
# os.environ.get returns None (no exception) when a variable is not set, so
# either name below may be None if the key is missing.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [15]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Only create the index when it does not exist yet. An unconditional
# create_index call fails once the index is present, which made this cell
# impossible to re-run (e.g. on Restart & Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector length; the "Hello World" probe
        # for all-MiniLM-L6-v2 reports 384.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [42]:
# Re-export the keys so libraries that read them from the process environment
# can find them. os.environ only accepts strings, and a key that was not set
# is None at this point; fail with a message naming the missing key instead of
# the opaque "str expected, not NoneType" TypeError.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; add it to your .env file or environment."
        )
    os.environ[_key_name] = _key_value

In [None]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone.vectorstores import PineconeVectorStore

# Builds the vector store from every chunk in text_chunks, using the
# `embeddings` model and the index named by `index_name`.
# NOTE(review): presumably each run uploads all chunks again; once the index
# is populated, prefer the from_existing_index cell below — confirm.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [18]:
# Load existing index 

# Attach to the index by name instead of re-uploading documents. This rebinds
# `docsearch`, and relies on PineconeVectorStore having been imported in the
# upsert cell above.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding= embeddings
)

In [19]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x21a4655bfd0>

In [34]:
# Wrap the vector store as a similarity retriever, requesting k=3 results.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [35]:
# Smoke-test the retriever on its own (no LLM involved) with a sample question.
retrieved_docs = retriever.invoke("What is acne?")

In [36]:
retrieved_docs


[Document(id='7b8a3903-7849-4e38-9591-ccff7edcde5a', metadata={'page': 685.0, 'source': 'data\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='rare disorder characterized by an abnormal increase in\nthe number of mature red cells in the blood.\nGALE ENCYCLOPEDIA OF MEDICINE 2 2981\nSecondary polycythemia\nThis young boy is afflicted with seborrheic dermatitis.(Cus-\ntom Medical Stock Photo. Reproduced by permission.)\nKEY TERMS\nAcne—A chronic inflammation of the sebaceous\nglands that manifests as blackheads, whiteheads,\nand/or pustules on the face or trunk.\nPsoriasis—A skin disorder of chronic, itchy scaling'),
 Document(id='5994d42a-b666-48ee-917e-ca21866648d3', metadata={'page': 622.0, 'source': 'data\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='matologist will attempt to rule out a number of other dis-\neases that have similar symptoms. Acne vulgaris is per-\nhaps the disorder most commonly mistaken for rosacea,\nbut redness and spider-like ve

In [43]:
from langchain_openai import OpenAI
# LLM used by the question-answering chain; answers are capped at 500 tokens.
# presumably authenticates via the OPENAI_API_KEY environment variable — confirm.
llm = OpenAI(temperature=0.4, max_tokens= 500)

In [44]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the QA chain. The adjacent string literals are
# concatenated into one string, so each fragment must end with its own
# punctuation and trailing space. "{context}" is a template placeholder, not
# literal text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    # Fixed: the sentence previously had no period, so it ran into the next one.
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use maximum three sentences and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system message carries the instructions plus
# the {context} placeholder, and the human message carries the {input}
# placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [45]:
# Combine the LLM and prompt into a document-answering chain, then put the
# retriever in front of it to form the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [46]:
# Run the chain end to end; the question goes in under the "input" key and the
# reply is read from the "answer" key.
# NOTE(review): the recorded run of this cell failed with an OpenAI 429
# `insufficient_quota` RateLimitError, so no answer was printed.
response = rag_chain.invoke({"input": "What is acne?"})
print(response["answer"])

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [48]:
import streamlit as st

In [53]:
def set_retriever():
    """Set up the retriever to fetch documents from Pinecone.

    Connects to the existing index named by the module-level ``index_name``
    using the module-level ``embeddings``, and wraps it as a similarity
    retriever with ``k=3``.

    Returns:
        The retriever produced by ``as_retriever``.
    """
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings
    )
    # Return the retriever: previously the function fell off the end, so it
    # always returned None and the object built here was discarded.
    return docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [54]:
# NOTE(review): this prints the module-level `retriever` created in an earlier
# cell. set_retriever() is not called in the visible cells, and the recorded
# output shows the same vector-store address as the earlier `docsearch`.
print("Retriever initialized: ",retriever)


Retriever initialized:  tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'] vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x0000021A4655BFD0> search_kwargs={'k': 3}
