In [None]:
%pwd

'e:\\Project\\Learn\\GenAI\\LLM_Med_Chatbot\\research'

In [11]:
import os

# Move from research/ up to the project root so that relative paths such as
# 'Data/' resolve correctly.
# Guarded on the directory name so that re-running this cell does not keep
# climbing further up the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [12]:
%pwd

'e:\\Project\\Learn\\GenAI\\LLM_Med_Chatbot'

In [13]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
# Extract the Data from PDF File:

def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan for files matching ``*.pdf``.

    Returns:
        The list of documents produced by loading each matching file
        with ``PyPDFLoader``.
    """
    return DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [15]:
# Load the PDFs under Data/ (relative to the current working directory, i.e. the project root).
extracted_data = load_pdf_file(data='Data/')

In [16]:
len(extracted_data)

637

In [17]:
# Perform Text Splitting:
def text_extraction(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of ``load_pdf_file``).
        chunk_size: Maximum size of each chunk passed to the splitter.
            Defaults to 500, the value this notebook was originally run with.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20.

    Returns:
        The list of chunks returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(extracted_data)

In [18]:
# Split the loaded documents into chunks; the trailing expression displays how many chunks were produced.
text_chunks = text_extraction(extracted_data)
len(text_chunks)

5860

In [3]:
# Perform Vector Embedding:
from langchain.embeddings import HuggingFaceEmbeddings
# Download the Embeddings from Hugging Face:
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Build the Hugging Face embedding model used to vectorise text.

    Args:
        model_name: Hugging Face model identifier. The default,
            'sentence-transformers/all-MiniLM-L6-v2', returns 384-dimensional
            embeddings; if another model is chosen, the dimension of the
            vector index must match that model's output size.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [4]:
# Instantiate the embedding model; the same object is used later for indexing and for querying.
embeddings = download_hugging_face_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
# Sanity check: embed a sample query and display the vector length.
# The result (384) is the dimension the Pinecone index is created with.
query_result = embeddings.embed_query("Hello World")
len(query_result)

384

In [7]:
# Creating the Pinecone index that will store the vector embeddings

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = 'medicalbot'

# Only create the index when it does not exist yet, so this cell can be re-run
# (e.g. after a kernel restart) without create_index failing on the existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length of all-MiniLM-L6-v2 (384)
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

In [8]:
import os
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# os.environ only accepts str values, so a missing key would otherwise surface
# as an opaque TypeError on the assignment below. Fail early with a clear message.
if PINECONE_API_KEY is None:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )
# Re-export the key so libraries that read it from the environment can find it.
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

In [20]:
# Embedding the text chunks and upserting them into the Pinecone index

from langchain_pinecone import PineconeVectorStore

# NOTE(review): no explicit ids are passed here, so re-running this cell
# presumably inserts the same chunks again under new ids — confirm before
# re-executing against a populated index.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name
)

In [21]:
# Connect to the index that already exists in Pinecone, reusing the same
# embedding model so that queries are embedded consistently with the stored chunks.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [22]:
type(docsearch)

langchain_pinecone.vectorstores.PineconeVectorStore

In [23]:
# Expose the vector store as a retriever: similarity search, k=2 results per query
retriever = docsearch.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 2},
)

In [24]:
# Run a sample question through the retriever to check what it returns.
retrived_docs = retriever.invoke("What is Acne?")

In [None]:
# Inspecting the documents retrieved from the Pinecone index
retrived_docs

[Document(id='a3ab2b9c-c0e0-4d93-a0da-03292c294641', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='3603685d-d442-4343-994c-4293830118f9', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [30]:
# Integrating the LLM model
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# os.environ only accepts str values, so a missing key would otherwise surface
# as an opaque TypeError on the assignment below. Fail early with a clear message.
if OPENAI_API_KEY is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

from langchain_openai import OpenAI
# Low temperature for more deterministic answers; max_tokens caps the response at 500 tokens.
llm = OpenAI(temperature=0.3,
             max_tokens=500)

In [None]:
# Imports for building the RAG (retrieval-augmented generation) chain

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate



In [34]:
# Creating the system prompt and the chat prompt template

# Instructions for the model. The {context} placeholder at the end is a
# template variable; presumably it is filled with the retrieved documents by
# the documents chain — verify against create_stuff_documents_chain.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message template: the system instructions above, then the user's
# question, supplied under the 'input' key when the chain is invoked.
prompt = ChatPromptTemplate.from_messages([
    ('system',system_prompt),
    ('human',"{input}"),
])

In [37]:
# Creating the RAG chain
# Combine the LLM and the prompt into a chain that answers from supplied documents.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Wire the retriever in front of that chain so documents are fetched for each input.
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [38]:
# Ask a sample question; the generated text is read from the 'answer' key of the response.
response = rag_chain.invoke({'input':'What is Acne?'})
print(response['answer'])



Acne is a skin disorder that involves inflammation of the sebaceous glands. It is also known as acne vulgaris and is commonly seen on the face. It is caused by a variety of factors, including hormonal changes and bacteria.
