In [2]:
print("Hello World!")

Hello World!


In [3]:
%pip install --upgrade jupyter
%pip install --upgrade ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import Pinecone
import pinecone
# from langchain.embeddings import Embeddings
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [5]:
# PINECONE_API_KEY = "5851aa9a-ea47-4c66-a323-bdbfeed95371"
import os
from dotenv import load_dotenv
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

In [6]:
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    document = loader.load()
    return document

In [7]:
import os
pdf_path = os.path.join('..', 'data')
extracted_data = load_pdf(pdf_path)

In [8]:
# Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks  


In [9]:
text_chunks = text_split(extracted_data)
print("Chunk Length: ", len(text_chunks))

Chunk Length:  7020


In [10]:
# Download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [11]:
embeddings = download_hugging_face_embeddings()

In [12]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [13]:
query_result = embeddings.embed_query("Hello World")
print("Length", len(query_result))

Length 384


In [14]:
query_result

[-0.03447718918323517,
 0.031023189425468445,
 0.006734984926879406,
 0.026109009981155396,
 -0.03936197981238365,
 -0.16030246019363403,
 0.06692398339509964,
 -0.006441500969231129,
 -0.04745054990053177,
 0.01475889328867197,
 0.07087533175945282,
 0.05552756413817406,
 0.01919332891702652,
 -0.02625131793320179,
 -0.01010945811867714,
 -0.026940524578094482,
 0.022307483479380608,
 -0.022226670756936073,
 -0.14969268441200256,
 -0.017493031919002533,
 0.007676261011511087,
 0.05435234680771828,
 0.0032544753048568964,
 0.03172600269317627,
 -0.0846213549375534,
 -0.029405953362584114,
 0.05159570649266243,
 0.0481240414083004,
 -0.003314815927296877,
 -0.05827920883893967,
 0.04196928068995476,
 0.022210700437426567,
 0.1281888484954834,
 -0.02233891189098358,
 -0.011656275950372219,
 0.06292835623025894,
 -0.032876234501600266,
 -0.09122606366872787,
 -0.031175419688224792,
 0.05269957706332207,
 0.0470348596572876,
 -0.08420299738645554,
 -0.030056182295084,
 -0.02074472792446613

In [15]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medical-chatbot"

In [16]:
# Creating embedding for each of the text chunks and storing
# docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name = index_name)
docsearch = LangchainPinecone.from_texts(
    texts=[t.page_content for t in text_chunks],
    embedding=embeddings,
    index_name=index_name,
)   


In [17]:
docsearch = LangchainPinecone.from_existing_index(index_name, embeddings)
query = "What are allergies ?"
docs = docsearch.similarity_search(query, k = 3)
print("Result", docs)

Result [Document(page_content='the itchy, scratchy nose, eyes, and throat common inallergic rhinitis .\nThe particular allergens to which a person is sensi-'), Document(page_content='the itchy, scratchy nose, eyes, and throat common inallergic rhinitis .\nThe particular allergens to which a person is sensi-'), Document(page_content='the itchy, scratchy nose, eyes, and throat common inallergic rhinitis .\nThe particular allergens to which a person is sensi-')]


In [4]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [5]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [9]:
# from huggingface_hub import login
# login(token="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")

llm = CTransformers(model="../model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={"max_new_tokens":512,
                            'temperature':0.8})

In [1]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

NameError: name 'RetrievalQA' is not defined

In [None]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])

Response :  The disease caused by a lack of vitamin C is known as scurvy.
Response :  Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
