In [1]:
print("Hello!")

Hello!


In [6]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings

from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.pinecone import Pinecone
from langchain.vectorstores import Chroma

In [3]:
# Extract the data from the pdf book
def load_data(data):
    loader = DirectoryLoader(data,
    glob="*.pdf",
    loader_cls=PyPDFLoader)

    documents = loader.load()
    return documents
    

In [4]:
extracted_pdf = load_data("/Users/anuragtrivedi/SAXONY_AI/SAMSONChatbot/Data/")
print("Data Extracted")

Data Extracted


In [5]:
#create text Chunks
def text_split(extracted_pdf):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                   chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_pdf)
    return text_chunks 

In [6]:
text_chunks = text_split(extracted_pdf)
print("chunks created")
print("The number of chunks is:",len(text_chunks))

chunks created
The number of chunks is: 85


In [7]:
#download embedding model
def download_huggingfaceembedding():
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding

In [8]:
embedding = download_huggingfaceembedding()

In [9]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
query_result = embedding.embed_query("Hello World!")
print("Length:", len(query_result))
query_result

Length: 384


[-0.020386872813105583,
 0.025280868634581566,
 -0.0005661954055540264,
 0.011615422554314137,
 -0.03798840939998627,
 -0.11998124420642853,
 0.04170947149395943,
 -0.02085716463625431,
 -0.0590067133307457,
 0.024232538416981697,
 0.06212018430233002,
 0.06767993420362473,
 0.03310025855898857,
 -0.010369361378252506,
 -0.03121568262577057,
 -0.032733261585235596,
 -0.0021117450669407845,
 0.009261966682970524,
 -0.12476461380720139,
 0.01123682502657175,
 0.039045464247465134,
 0.05440254509449005,
 -0.00282548600807786,
 0.04455624520778656,
 -0.0854201689362526,
 -0.022873707115650177,
 0.039140596985816956,
 0.036046866327524185,
 -0.03212682530283928,
 -0.06425875425338745,
 0.05812908709049225,
 0.04669087007641792,
 0.08061559498310089,
 -0.007734288927167654,
 -0.02208317629992962,
 0.06713155657052994,
 -0.04504146799445152,
 -0.10212123394012451,
 0.0012643603840842843,
 0.04680192098021507,
 0.026395879685878754,
 -0.06990958750247955,
 -0.04453349485993385,
 -0.00690193707

In [11]:
# save to disk
query = "What did the president say about Ketanji Brown Jackson"
db2 = Chroma.from_documents(text_chunks, embedding, persist_directory="./chroma_db")
docs = db2.similarity_search(query)

# load from disk
db3 = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
docs = db3.similarity_search(query)
print(docs[0].page_content)

400 16 352 760 1180 1803 2669 4085 5486 6341
450 18 458 987 1537 2348 3478 5349 7177 8295
500 20 555 1195 1853 2832 4192 6422 8617 9961
600 24 810 1747 2708 4139 6126 9386 12593 14559
700 28 1099 2369 3674 5614 8309 12730 17082 19741
800 32 1449 3125 4845 7403 10957 16787 22525 26035
900 36 1848 3981 6170 9429 13956 21375 28685 33155
1000 40 2275 4902 7605 11619 17195 25175 35345 40850
1200 48 3278 7068 10963 17148 24790 36043 50963 58900
1300 52 4005 8610 13345 20419 29608 42932 60105 69121


In [11]:
# # load it into Chroma
# db = Chroma.from_documents(text_chunks, embedding)

# # query it
# query = "What did the president say about Ketanji Brown Jackson"
# docs = db.similarity_search(query)

In [7]:
#from langchain_community.llms import CTransformers

llm = CTransformers(model="/Users/anuragtrivedi/samson-chatbot/model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={"max_new_tokens":512,
                            'temperature':0.8})

In [8]:
print(llm("AI is going to"))

  warn_deprecated(


 be a big deal in the future, and it’s important that we get it right. everybody needs to be aware of how it’s going to impact their lives, whether they work in healthcare or not.”
— Dr. Eric Topol (@EricTopol) March 10, 2023
Dr. Eric Topol is a well-known cardiologist and digital medicine expert who has been at the forefront of the field for many years. In this tweet, he emphasizes the importance of educating people about AI in healthcare, particularly those outside of the industry. He notes that AI will have a significant impact on everyone's lives, regardless of their profession or occupation. By raising awareness and promoting understanding, we can ensure that people are prepared for the changes that AI will bring.
Dr. Topol is not alone in his concerns about the need for education and transparency in AI development and deployment. Many experts have called for greater investment in AI literacy programs, both within the healthcare industry and beyond. As AI becomes more integrated i

In [22]:
prompt_template = """
Use the following piece of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to generate any random answer from your own

Context:{context}
Question:{question}

Only return the helpful answer and nothing else
helpful answer:
"""

In [23]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context","question"])
chain_type_kwargs = {"prompt":PROMPT}

In [26]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever= db.as_retriever(search_kwargs={"k":2}),
    return_source_documents = True,
    chain_type_kwargs=chain_type_kwargs)

In [27]:
while True:
    user_input = input(f"Input Prompt: ")
    result = qa({"query":user_input})
    print("Response:", result["result"])