In [1]:
# Moving one directory up (to the parent of the directory the kernel started in)

import os

# Remember where the kernel started the first time this cell runs. A plain
# os.chdir("../") climbs one more level on every re-run of the cell, which
# breaks the relative paths (e.g. "data/") used further down.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Always land in the parent of the recorded start directory, so re-running
# this cell is harmless.
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))

In [2]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Text Splitting

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
def pdf_extractor(directory):
    """Load the PDF files found in ``directory``.

    Builds a ``DirectoryLoader`` that matches ``*.pdf`` and reads every match
    with ``PyPDFLoader``, then returns whatever ``load()`` produces.
    """
    return DirectoryLoader(directory, glob="*.pdf", loader_cls=PyPDFLoader).load()


# Splitting data into chunks
def text_split(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split (e.g. the output of ``pdf_extractor``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``; defaults
            to 500, the value that was previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``;
            defaults to 20, the value that was previously hard-coded.

    Returns:
        The result of ``splitter.split_documents(data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)


In [7]:
# Run the PDF extractor over the data/ folder
extracted_data = pdf_extractor(directory="data/")

In [8]:
# Split the extracted documents and report how many chunks were produced
chunks = text_split(data=extracted_data)
chunk_count = len(chunks)
print(f"Number of Chunks: {chunk_count}")

Number of Chunks: 5860


# Embeddings

In [7]:
# Set up the sentence-transformers model used to embed text
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [10]:
# Sanity check: embed a sample query and print the vector length (expected: 384)
result = embeddings.embed_query("Hello world")
vector_length = len(result)
print(f"Length: {vector_length}")

Length: 384


# Database

In [4]:
# Loading Pinecone API key from .env
from dotenv import load_dotenv

load_dotenv()

# Re-assigning os.environ["PINECONE_API_KEY"] to its own value does nothing
# when the key is set and raises an opaque TypeError when it is missing
# (os.environ only accepts str values). Fail fast with a readable message.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your .env file or environment."
    )

In [None]:
# Initializing and setting up the Pinecone database for use
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
db_name = "medibot"

# create_index fails when an index with this name already exists, so only
# create it when it is missing; this keeps the cell safe to re-run.
if db_name not in pc.list_indexes().names():
    pc.create_index(
        name=db_name,
        dimension=384,  # Size of the embeddings model output
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Last expression: show the index description as the cell output
pc.describe_index(db_name)

Error while installing plugin inference: can't set attribute 'inference'
Traceback (most recent call last):
  File "c:\Users\ACER\Desktop\Code\Medical ChatBot\Medical-ChatBot\.venv\lib\site-packages\pinecone_plugin_interface\actions\installation.py", line 13, in install_plugins
    setattr(target, plugin.namespace, impl(target.config, plugin_client_builder))
AttributeError: can't set attribute 'inference'


{
    "name": "medibot",
    "metric": "cosine",
    "host": "medibot-1w80f6q.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
# Embed every chunk and store it in the Pinecone index initialized earlier
from langchain_pinecone import PineconeVectorStore

doc_search = PineconeVectorStore.from_documents(embedding=embeddings, index_name=db_name, documents=chunks)

Error while installing plugin inference: can't set attribute 'inference'
Traceback (most recent call last):
  File "c:\Users\ACER\Desktop\Code\Medical ChatBot\Medical-ChatBot\.venv\lib\site-packages\pinecone_plugin_interface\actions\installation.py", line 13, in install_plugins
    setattr(target, plugin.namespace, impl(target.config, plugin_client_builder))
AttributeError: can't set attribute 'inference'


# Querying

In [5]:
# Loading Pinecone API key from .env
from dotenv import load_dotenv

load_dotenv()

# Re-assigning os.environ["PINECONE_API_KEY"] to its own value does nothing
# when the key is set and raises an opaque TypeError when it is missing
# (os.environ only accepts str values). Fail fast with a readable message.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your .env file or environment."
    )

In [8]:
# Connect to the already-populated Pinecone index
from langchain_pinecone import PineconeVectorStore

db_name = "medibot"
doc_search = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=db_name)

In [9]:
# Build a retriever over the vector store: similarity search with k=3
top_k_search = {"k": 3}
retriever = doc_search.as_retriever(search_type="similarity", search_kwargs=top_k_search)

In [19]:
# Try the retriever on a sample question; the cell displays the returned documents
sample_question = "What is acne?"
retriever.invoke(sample_question)

[Document(id='326333cc-7ae1-4a2a-8122-7dca07acebb2', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\the_gale_encyclopedia_of_medicine_volume_ii.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='9e32c910-f367-4ca8-8322-64164e5785a5', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\the_gale_encyclopedia_of_medicine_volume_ii.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\na

# Integrating LLM

In [11]:
# Loading GENAI key from .env and exposing it as GOOGLE_API_KEY
from dotenv import load_dotenv

load_dotenv()

# Prefer GENAI_API_KEY (the name used in .env); fall back to a GOOGLE_API_KEY
# that is already set. Assigning None into os.environ raises an opaque
# TypeError, so check explicitly and fail with a readable message instead.
genai_api_key = os.environ.get("GENAI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not genai_api_key:
    raise RuntimeError(
        "GENAI_API_KEY is not set. Add it to your .env file or environment."
    )
os.environ["GOOGLE_API_KEY"] = genai_api_key

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Settings for the Gemini chat model, passed through as keyword arguments
gemini_settings = {"model": "gemini-1.5-pro", "temperature": 0, "max_retries": 2}
llm = ChatGoogleGenerativeAI(**gemini_settings)

In [36]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System prompt that tells the model how to answer from the retrieved context.
# The original wording had a broken first instruction ("You the provided
# piecies of retreived context ...") and stray indentation inside the string;
# both are fixed here. {context} is the template variable for the retrieved
# documents.
sys_prompt = (
    "You are an assistant used for question-answering tasks. "
    "Use the provided pieces of retrieved context to answer the question given to you. "
    "If you do not know the answer, or the context does not contain it, "
    "say that you don't know instead of saying anything else. "
    "Be as concise with your answer as possible while providing all the relevant details "
    "to answer the question. "
    "Try to keep your answer under three sentences."
    "\n\n"
    "{context}"
)

# Constructing prompt to be used in chain: system instructions + user question
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", sys_prompt),
        ("human", "{input}"),
    ]
)

# Document-stuffing chain (LLM + prompt), then the retrieval chain on top of it
chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, chain)

In [30]:
# Testing responses on topics covered in the book
question = "What is acne?"
response = rag_chain.invoke({"input": question})
print(response["answer"])

Acne, medically known as acne vulgaris, is a common skin disease characterized by pimples on the face, chest, and back.  It occurs when pores become clogged with oil, dead skin cells, and bacteria.


In [37]:
# Testing responses on topics not covered in the book
question = "What is statistics?"
response = rag_chain.invoke({"input": question})
print(response["answer"])

This document discusses blood counts and does not contain the answer to your question.
