In [1]:
# Show the kernel's current working directory (IPython line magic).
%pwd


'a:\\GenAI_Project\\First_Nurse\\research'

In [2]:
import os

# Move from the notebook folder ("research") up to its parent so that relative
# paths such as "data" resolve. The guard keeps this cell idempotent: without
# it, every re-run of the cell climbs one directory higher and later cells
# fail to find their files.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [4]:
import sys
# Print the path of the Python interpreter running this kernel, to confirm
# which environment is active.
print(sys.executable)


c:\Users\chand\anaconda3\envs\medibot\python.exe


In [5]:
# Load every PDF found in a directory
def load_pdf_files(data):
    """Load the PDF files located in the directory `data`.

    A DirectoryLoader is created over `data` with the glob "*.pdf" and
    PyPDFLoader as the per-file loader class; the result of its `load()`
    call is returned unchanged.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs from the "data" directory (path is relative to the current
# working directory).
extracted_data = load_pdf_files("data")

In [7]:
# Number of documents returned by the loader.
len(extracted_data)

637

In [8]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a new list of Document objects that keep each input's
    page_content and carry only the 'source' entry in their metadata.

    When an input document's metadata has no 'source' key, the new
    document's metadata is {"source": None}.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]
    

In [9]:
# Reduce each loaded document to its text plus a 'source'-only metadata dict.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [10]:
# splitting the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks with RecursiveCharacterTextSplitter.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Maximum chunk length, measured with `len` (characters).
            Defaults to 500, the value this notebook previously hard-coded.
        chunk_overlap: Number of characters shared between neighbouring
            chunks so context is not cut off at a boundary. Defaults to 20.

    Returns:
        The list of chunk documents produced by `split_documents`.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(minimal_docs)

In [11]:
# Split the minimal documents into chunks and report how many were produced.
texts_chunk = text_split(minimal_docs)
print(f"Number of texts: {len(texts_chunk)}")

Number of texts: 5859


In [12]:
#embedding
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Create the embedding model used throughout this notebook.

    Returns a HuggingFaceEmbeddings instance configured with the model name
    "sentence-transformers/all-MiniLM-L6-v2".
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Create the embedding model once; it is reused for the sample query below
# and for the Pinecone vector store.
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [13]:
# Display the embedding object's repr (model name and configuration).
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [14]:
# Sanity check: embed one sample sentence.
vector = embedding.embed_query("am in love with you")

In [15]:
# Length of the embedding vector; the Pinecone index `dimension` used later
# in this notebook must equal this value.
len(vector)

384

In [16]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment so the API
# keys can be read with os.getenv in the next cell.
load_dotenv()

True

In [17]:
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        EnvironmentError: if the variable is unset or empty. Without this
            check a missing key surfaces later as an opaque
            "str expected, not NoneType" TypeError on the os.environ assignment.
    """
    value = os.getenv(name)
    if not value:
        raise EnvironmentError(
            f"Missing required environment variable {name!r}. "
            "Define it in your .env file or in the shell environment."
        )
    return value


# Read the API keys from the environment; fail early with a clear message.
PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

# Re-export the keys so libraries that read them from os.environ can find them.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:


from openai import OpenAI
# Raw OpenAI client built with the key read from the environment.
# NOTE(review): `client` is not referenced by any later cell visible in this
# notebook (the chat model is built with ChatOpenAI) — confirm whether this
# cell is still needed.
client = OpenAI(api_key = OPENAI_API_KEY)

In [20]:
from pinecone import Pinecone
# Reuse the key that was read from the environment earlier in the notebook.
pinecone_api_key = PINECONE_API_KEY

# Pinecone client used below to check for, create, and open the index.
pc = Pinecone(api_key=pinecone_api_key)


In [21]:
# Display the client object to confirm it was constructed.
pc

<pinecone.pinecone.Pinecone at 0x29f3c1a6320>

In [22]:
# Create the Pinecone index if it does not exist yet, then open a handle to it
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# The has_index check makes this cell safe to re-run: create_index is only
# called when no index with this name exists.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,  # must equal the embedding length (len(vector) is 384 in this notebook)
        metric = "cosine",
        spec = ServerlessSpec(cloud = "aws", region = "us-east-1")


    )
index = pc.Index(index_name)

In [23]:
from langchain_pinecone import PineconeVectorStore

# Embed and upsert the chunks only when the index is still empty.
# from_documents stores each chunk under a newly generated id, so running
# this cell again on a populated index inserts a second copy of every chunk
# and the retriever then returns the same passage several times.
# NOTE(review): index statistics may lag briefly after writes — if you have
# just ingested, wait a moment before re-running. Copies already stored in
# the index are not removed by this guard; clear the index to get rid of them.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents = texts_chunk,
        embedding = embedding,
        index_name = index_name,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name = index_name,
        embedding = embedding,
    )

In [24]:
# Connect to the existing Pinecone index (no documents are passed in this call)
from langchain_pinecone import PineconeVectorStore
# Build the vector store from the index name and the embedding model; it is
# used to create the retriever below
docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embedding
)

In [25]:
# Build the retrieval-augmented generation (RAG) pipeline


In [26]:
# Similarity-search retriever over the vector store; k = 3 results per query.
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs = {"k" : 3})


In [27]:
# Smoke-test the retriever with a sample question and display the matches.
# NOTE(review): identical page_content under different ids in the result
# suggests the same chunks were upserted into the index more than once.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='22748e44-6726-427c-8fac-2f6d1372ed6b', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='1f873c99-c6fd-458f-af74-c5e6602c83bd', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='6e10f2f4-5219-452b-b499-8bd96f5281ef', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26')]

In [28]:
from langchain_openai import ChatOpenAI

# The API key comes from the OPENAI_API_KEY variable read from the environment.
# Never paste a key into the notebook, not even inside a comment: it ends up in
# version control and in every shared copy. Any key that was ever written here
# must be revoked and replaced.
chatModel = ChatOpenAI(model = "gpt-4o-mini",
                       openai_api_key= OPENAI_API_KEY)


In [29]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [30]:
# Prompt: a system message carrying the instructions plus the retrieved
# context, followed by the user's question.

# "{context}" and "{input}" are template placeholders filled in when the
# chain runs.
# NOTE(review): the first sentence reads "an Medical assistant" (should be
# "a medical assistant"); left unchanged here because it is part of the text
# sent to the model.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [31]:
# Combine the chat model and the prompt into a question-answering chain,
# then pair that chain with the retriever to form the RAG chain.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [32]:
# Ask a question through the RAG chain and print the "answer" field of the
# response.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of a hormone from the pituitary gland, leading to excessive growth of bones and soft tissues, along with various other health disturbances. Gigantism is a related condition that occurs in children, resulting in excessive height and growth due to the same underlying hormone imbalance before the growth plates close. Both conditions are caused by an overproduction of growth hormone.


In [33]:
# Second sample question; `response` is overwritten with the new result.
response = rag_chain.invoke({"input" : "what is acne"})
print(response["answer"])

Acne is a common skin condition that occurs when hair follicles become blocked with oil and dead skin cells. It often leads to the formation of pimples, blackheads, and cysts, primarily on the face, back, and shoulders. Hormonal changes, bacteria, and certain medications can contribute to its development.
