In [1]:
print("OK")

OK


In [62]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone.vectorstores import PineconeVectorStore
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain import hub
import os

In [50]:
filename = "Python_Programming.pdf"

In [51]:
path = os.path.join("../data", filename)

In [52]:
print(path)

../data\Python_Programming.pdf


In [53]:
loader = PyPDFLoader(path)

In [45]:
loader.load()

EmptyFileError: Cannot read an empty file

In [40]:
pages = []

for page in loader.lazy_load():
    pages.append(page)

KeyboardInterrupt: 

In [26]:
pages[0].metadata

{'source': '../data/Gale.pdf', 'page': 0}

In [27]:
pages[3].page_content

'Introduction .................................................... ix\nAdvisory Board .............................................. xi\nContributors ................................................. xiii\nEntries\nVolume 1: A-B .............................................. 1\nVolume 2: C-F .......................................... 625\nVolume 3: G-M ....................................... 1375\nVolume 4: N-S ........................................ 2307\nVolume 5: T-Z ........................................ 3237\nOrganizations ............................................ 3603\nGeneral Index ............................................ 3625\nGALE ENCYCLOPEDIA OF MEDICINE 2 VCONTENTS'

In [38]:
from dotenv import load_dotenv
import os

load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
HF_TOKEN = os.getenv('HF_TOKEN')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

In [5]:
def load_pdf(data):
    loader = DirectoryLoader(data, glob = "*.pdf", loader_cls = PyPDFLoader)

    document = loader.load()
    return document

In [7]:
extracted_data = load_pdf("../data/")

### Create text chunks

In [19]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [20]:
texts = text_split(pages)

In [23]:
print(texts[3])

page_content='Multimedia Content
Kelly A. Quin, Editor, Imaging and Multimedia Content
Leitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,
Image Catalogers
Pamela A. Reed, Imaging Coordinator
Randy Bassett, Imaging Supervisor
Robert Duncan, Senior Imaging Specialist
Dan Newell, Imaging Specialist
Christine O’Bryan, Graphic Specialist
Maria Franklin, Permissions Manager
Margaret A. Chamberlain, Permissions Specialist
Michelle DiMercurio, Senior Art Director
Mike Logusz, Graphic Artist' metadata={'source': '../data/Gale.pdf', 'page': 2}


### Embedding text chunks

In [58]:
def download_hf_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [59]:
embedding = download_hf_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")


In [6]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [63]:
docsearch = PineconeVectorStore(index_name="resume-59e56b31", embedding=embedding)

In [65]:
import asyncio

asyncio.run(docsearch.asimilarity_search("What is my name?"))

NameError: name 'asyncio' is not defined

### Initialize Pinecone

In [17]:
index_name = "medical-chatbot"

In [None]:
docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embedding, index_name = index_name)

In [18]:
docsearch = Pinecone.from_existing_index(index_name, embedding)

query = "What are allergies?"

docs = docsearch.similarity_search(query, k = 3)

print(docs)

[Document(page_content='ORGANIZATIONS\nAmerican Academy of Ophthalmology. 655 Beach Street, PO\nBox 7424, San Francisco, CA 94120-7424. <http://www.eyenet.org>.KEY TERMS\nAllergen —A substance capable of inducing an\nallergic response.\nAllergic reaction —An immune system reaction to\na substance in the environment; symptomsinclude rash, inflammation, sneezing, itchy wateryeyes, and runny nose.\nConjunctiva —The mucous membrane that covers\nthe white part of the eyes and lines the eyelids.'), Document(page_content='Although environmental medicine is gaining more\nrespect within conventional medicine, detoxificationKEY TERMS\nAllergen —A foreign substance, such as mites in\nhouse dust or animal dander, that wheninhaled,causes the airways to narrow and pro-duces symptoms of asthma.\nAntibody —A protein, also called immunoglobu-\nlin, produced by immune system cells to removeantigens (the foreign substances that trigger theimmune response).\nFibromyalgia —A condition of debilitating pain,

In [22]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [23]:
retriever.invoke("What are allergens?")

[Document(page_content='ORGANIZATIONS\nAmerican Academy of Ophthalmology. 655 Beach Street, PO\nBox 7424, San Francisco, CA 94120-7424. <http://www.eyenet.org>.KEY TERMS\nAllergen —A substance capable of inducing an\nallergic response.\nAllergic reaction —An immune system reaction to\na substance in the environment; symptomsinclude rash, inflammation, sneezing, itchy wateryeyes, and runny nose.\nConjunctiva —The mucous membrane that covers\nthe white part of the eyes and lines the eyelids.'),
 Document(page_content='Although environmental medicine is gaining more\nrespect within conventional medicine, detoxificationKEY TERMS\nAllergen —A foreign substance, such as mites in\nhouse dust or animal dander, that wheninhaled,causes the airways to narrow and pro-duces symptoms of asthma.\nAntibody —A protein, also called immunoglobu-\nlin, produced by immune system cells to removeantigens (the foreign substances that trigger theimmune response).\nFibromyalgia —A condition of debilitating pain

In [9]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")



In [10]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

### Loading the llama 3 model

In [39]:
from langchain_ollama import ChatOllama

local_llm = 'llama3.2'
llm = ChatOllama(model=local_llm)

In [41]:
print(llm.invoke("What is the capital of England"))

content="England does not have a capital city in the classical sense. The United Kingdom (UK) is made up of four countries: England, Scotland, Wales, and Northern Ireland.\n\nThe capital of England is often considered to be London, which is also the capital of the UK. However, this can be misleading as London is a global city that serves as the capital of multiple countries, including England, the UK, and the Commonwealth.\n\nIn practice, though, the capital of England is typically taken to be Birmingham or another major English city, but there isn't an official capital designated specifically for England." response_metadata={'model': 'llama3.2', 'created_at': '2024-10-04T08:38:53.2386198Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 14221404700, 'load_duration': 68159600, 'prompt_eval_count': 31, 'prompt_eval_duration': 167193000, 'eval_count': 120, 'eval_duration': 13951925000} id='run-785e2154-b8bb-47d8-b97b-92b39ae04f50-0

In [24]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

for chunk in rag_chain.stream("What are allergies?"):
    print(chunk, end="", flush=True)

Here is a concise answer:

Allergies are an immune system reaction to a substance in the environment that can cause symptoms such as rash, inflammation, sneezing, itchy watery eyes, and runny nose. Allergens are substances capable of inducing this reaction, which can be caused by exposure to foreign substances like dust mites or animal dander. Symptoms can range from mild to severe depending on the individual's sensitivity to the allergen.

In [25]:
for chunk in rag_chain.stream("What is cloud computing?"):
    print(chunk, end="", flush=True)

I don't have information on cloud computing. The retrieved context appears to be unrelated to this topic. I don't know what cloud computing is.

In [32]:
op = rag_chain.stream("What are allergens?")

In [33]:
a = list(op)

In [34]:
a

['Here',
 ' is',
 ' a',
 ' ',
 '3',
 ' sentence',
 ' answer',
 ':\n\n',
 'All',
 'erg',
 'ens',
 ' are',
 ' foreign',
 ' substances',
 ' that',
 ' can',
 ' induce',
 ' an',
 ' allergic',
 ' response',
 ' in',
 ' the',
 ' body',
 ',',
 ' such',
 ' as',
 ' m',
 'ites',
 ' in',
 ' house',
 ' dust',
 ' or',
 ' animal',
 ' d',
 'ander',
 '.',
 ' These',
 ' substances',
 ' trigger',
 ' an',
 ' immune',
 ' response',
 ' that',
 ' infl',
 'ames',
 ' the',
 ' skin',
 ' and',
 ' causes',
 ' symptoms',
 ' like',
 ' rash',
 ',',
 ' inflammation',
 ',',
 ' sne',
 'ez',
 'ing',
 ',',
 ' it',
 'chy',
 ' wat',
 'ery',
 ' eyes',
 ',',
 ' and',
 ' run',
 'ny',
 ' nose',
 '.',
 ' Common',
 ' allerg',
 'ens',
 ' include',
 ' poison',
 ' iv',
 'y',
 ',',
 ' frag',
 'rances',
 ',',
 ' pres',
 'ervatives',
 ',',
 ' latex',
 ' items',
 ',',
 ' and',
 ' certain',
 ' chemicals',
 ' and',
 ' food',
 ' additives',
 '.',
 '']

In [35]:
''.join(a)

'Here is a 3 sentence answer:\n\nAllergens are foreign substances that can induce an allergic response in the body, such as mites in house dust or animal dander. These substances trigger an immune response that inflames the skin and causes symptoms like rash, inflammation, sneezing, itchy watery eyes, and runny nose. Common allergens include poison ivy, fragrances, preservatives, latex items, and certain chemicals and food additives.'