In [42]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
import pinecone
from langchain.document_loaders import PyMuPDFLoader, PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import os

In [17]:
# Load variables from a local .env file into the process environment so the
# API keys can be read with os.getenv() in the next cell.
load_dotenv()

True

In [18]:
# Credentials come from the environment rather than being hard-coded.
# os.getenv() returns None when a variable is unset, so a missing key is not
# reported here.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE(review): OPENAI_API_KEY is never passed to a constructor in the visible
# cells; presumably the OpenAI classes read the environment themselves — confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [19]:
# Read the PDF files of a directory into documents
def load_pdf(data):
    """Load the PDF files of a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``"*.pdf"``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matching file
        parsed by ``PyPDFLoader``.
    """
    loader_options = {"glob": "*.pdf", "loader_cls": PyPDFLoader}
    return DirectoryLoader(data, **loader_options).load()

In [20]:
# Load the PDFs found in data/ (a relative path).
extract_data = load_pdf("data/")

In [21]:
# Number of documents returned by the loader.
len(extract_data)

637

In [22]:
# Split documents into overlapping text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split, e.g. the list returned by
            ``load_pdf``.
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 20, the previously hard-coded value.

    Returns:
        The chunks produced by ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [23]:
# Chunk the loaded documents and show how many chunks were produced.
chunks = text_split(extract_data)
len(chunks)

5860

In [24]:
# Build the embedding client
def download_OpenAIEmbedings():
    """Create an ``OpenAIEmbeddings`` instance with its default arguments.

    Returns:
        A newly constructed ``OpenAIEmbeddings`` object.
    """
    return OpenAIEmbeddings()

In [25]:
# Create the embedding client; the trailing expression echoes its configuration.
embeddings = download_OpenAIEmbedings()
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000023FC61F9970>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000023FC62C7970>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [26]:
# Sanity check: embed a short string and look at the vector length.
# NOTE(review): this length should match the dimension of the Pinecone index
# used below — confirm against the index configuration.
query_result = embeddings.embed_query('Hello world')
len(query_result)

1536

In [27]:
# Import the Pinecone SDK client under an alias. A later cell imports a
# different class that is also named `Pinecone` (from langchain_pinecone);
# with the alias, re-running this cell after that import still constructs
# the SDK client rather than the LangChain vector store.
from pinecone import Pinecone as PineconeClient

# Connect with the API key read from the environment and open a handle to
# the "mchatbot" index.
pc = PineconeClient(api_key=PINECONE_API_KEY)
index = pc.Index("mchatbot")

In [28]:
# Peek at the first chunk to see its structure (metadata and page_content).
chunks[0]

Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 1}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION')

In [29]:
def pinecone_upsert(chunks, embeddings):
    """Build Pinecone vector records for a collection of document chunks.

    Despite its name, this function does not write to Pinecone. It only
    prepares the records; sending them with ``index.upsert`` is left to
    the caller.

    Args:
        chunks: Iterable of objects exposing ``page_content`` (the chunk
            text) and ``metadata`` (a dict), such as the chunks returned
            by ``text_split``.
        embeddings: Embedding client exposing ``embed_documents(texts)``,
            which returns one vector per input text.

    Returns:
        A list with one dict per chunk, holding ``"id"`` (``"chunk<i>"``,
        where ``i`` is the chunk's position), ``"values"`` (the embedding)
        and ``"metadata"`` (a copy of the chunk metadata with the chunk
        text added under ``"text"``). An empty input yields an empty list.
    """
    # Materialize once so generators can be traversed twice below.
    chunks = list(chunks)
    if not chunks:
        return []

    texts = [chunk.page_content for chunk in chunks]
    # One batched call instead of a separate embedding request per chunk.
    all_values = embeddings.embed_documents(texts)

    return [
        {
            "id": f"chunk{i}",
            "values": values,
            # Build a new dict so the caller's chunk.metadata is not mutated.
            "metadata": {**chunk.metadata, "text": text},
        }
        for i, (chunk, text, values) in enumerate(zip(chunks, texts, all_values))
    ]
        

In [30]:
#vectors_data = pinecone_upsert(chunks=chunks, embeddings=embeddings)

In [31]:
#len(vectors_data)

In [32]:
# Disabled ingestion step, kept as a bare string so it does not execute.
# The code inside would send `vectors_data` to namespace 'ns1' in batches
# of 150 records.
''' batch_size = 150
for i in range(0, len(vectors_data), batch_size):
    batch = vectors_data[i : i + batch_size]
    index.upsert(vectors=batch, namespace='ns1')'''

" batch_size = 150\nfor i in range(0, len(vectors_data), batch_size):\n    batch = vectors_data[i : i + batch_size]\n    index.upsert(vectors=batch, namespace='ns1')"

In [33]:
# Disabled helper, kept as a bare string so it does not execute.
# The code inside embeds the question and queries namespace "ns1" directly
# for the top 2 matches, returning metadata.
# NOTE(review): the keyword is spelled `include_value` here; the Pinecone
# client documents `include_values` — confirm before re-enabling.
'''def query_pinecone(index,question,embeddings):
    question_embed = embeddings.embed_query(question)
    result = index.query(
    namespace = "ns1",
    vector = question_embed,
    top_k = 2,
    include_value = False,
    include_metadata = True
)
    return result
    this is official query from pinecone website
    '''

'def query_pinecone(index,question,embeddings):\n    question_embed = embeddings.embed_query(question)\n    result = index.query(\n    namespace = "ns1",\n    vector = question_embed,\n    top_k = 2,\n    include_value = False,\n    include_metadata = True\n)\n    return result\n    this is official query from pinecone website\n    '

In [34]:
# `Pinecone` here is LangChain's vector-store wrapper, not the client class
# of the same name exported by the `pinecone` SDK.
from langchain_pinecone import Pinecone

In [35]:
# Disabled alternative, kept as a bare string so it does not execute.
# The code inside wraps the already-opened `index` handle instead of looking
# the index up by name.
'''
vectorstore = Pinecone(
    index=index,                     
    embedding=embeddings,
    text_key="text"
)
'''

'\nvectorstore = Pinecone(\n    index=index,                     \n    embedding=embeddings,\n    text_key="text"\n)\n'

In [36]:
# Attach a LangChain vector store to the existing "mchatbot" index.
# `namespace` selects "ns1", and `text_key` names the metadata field that
# holds each chunk's text (pinecone_upsert stores it under "text").
docsearch = Pinecone.from_existing_index(
    index_name="mchatbot",
    embedding=embeddings,
    namespace="ns1",  
    text_key="text"
    )

In [48]:
# Sample question used to exercise retrieval.
query = "What are Allergies"

In [50]:
# Retrieval only: fetch 3 results for the query from the vector store
# and display them.
docs=docsearch.similarity_search(query, k=3)
docs

[Document(metadata={'page': 128.0, 'source': 'data\\Medical_book.pdf'}, page_content='Description\nAllergies are among the most common of medical\ndisorders. It is estimated that 60 million Americans, or\nmore than one in every five people, suffer from some\nform of allergy, with similar proportions throughout\nmuch of the rest of the world. Allergy is the single largest\nreason for school absence and is a major source of lost\nproductivity in the workplace.\nAn allergy is a type of immune reaction. Normally,\nthe immune system responds to foreign microorganisms'),
 Document(metadata={'page': 127.0, 'source': 'data\\Medical_book.pdf'}, page_content='1995.\nNovick, N. L. You Can Do Something About Your Allergies.\nNew York: Macmillan, 1994.\nWeil, A. Natural Health, Natural Medicine: A Comprehensive\nManual for Wellness and Self-Care.New York: Houghton\nMifflin, 1995.\nRichard Robinson\nAllergies\nDefinition\nAllergies are abnormal reactions of the immune sys-\ntem that occur in respons

In [43]:
# Chat model handed to the RetrievalQA chain.
# NOTE(review): 'gpt-4.5-preview' is a preview model name — confirm it is
# still served by the API before relying on this cell.
llm=ChatOpenAI(model_name = 'gpt-4.5-preview')

In [56]:
# NOTE(review): RetrievalQA is already imported in the first cell, and `OpenAI`
# is not used in any visible cell — both imports look redundant here.
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
# Build a "stuff"-type RetrievalQA chain on top of `llm`. The retriever is
# configured with k=2, and return_source_documents=True makes the result
# include the retrieved documents alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True)


In [53]:
# Run the QA chain on a sample question and display the full result dict.
qa.invoke("What are Allergies ")

{'query': 'What are Allergies ',
 'result': 'Allergies are abnormal reactions of the immune system that occur in response to otherwise harmless substances. Normally, the immune system protects the body by responding to harmful foreign microorganisms. However, in people with allergies, the immune system mistakenly identifies harmless substances, known as allergens, as threats, triggering an unnecessary immune response. Allergies are among the most common medical disorders, affecting millions worldwide, and are a significant cause of missed school days and reduced workplace productivity.',
 'source_documents': [Document(metadata={'page': 128.0, 'source': 'data\\Medical_book.pdf'}, page_content='Description\nAllergies are among the most common of medical\ndisorders. It is estimated that 60 million Americans, or\nmore than one in every five people, suffer from some\nform of allergy, with similar proportions throughout\nmuch of the rest of the world. Allergy is the single largest\nreason fo

In [57]:
# Ask for a question interactively and answer it through the QA chain.
# The prompt text is a plain string: it has no placeholders, so the
# f-prefix was unnecessary.
user_input = input("input prompt:").strip()
if not user_input:
    # Fail early with a clear message instead of sending a blank query to
    # the embedding model and the LLM.
    raise ValueError("Please enter a non-empty question.")
result = qa.invoke(user_input)

In [59]:
# Show only the generated answer text from the chain's result dict.
print(result['result'])

Allergies are reactions of the immune system in response to certain foreign substances, known as allergens, such as pollen, dust, pet dander, or certain foods. Normally, the immune system helps protect the body by producing specific proteins called antibodies to identify and neutralize harmful microorganisms or particles. However, in the case of allergies, the immune system mistakenly identifies harmless substances as threats, triggering an exaggerated immune response. This reaction results in various symptoms, including sneezing, itching, swelling, runny nose, watery eyes, skin rashes, or even more severe reactions like difficulty breathing. Allergies are very common and affect millions of people worldwide, significantly impacting daily activities, school attendance, work productivity, and overall quality of life.
